## **Feature Engineering Notebook**

This Notebook is made for developing features from the existing raw data to train a *Classification Model* for predicting whether the food will be delivered on time or not.

In [92]:
# Importing libraries for feature engineering.
import pandas as pd
import numpy as np

In [93]:
# Reading processed data and converting it into dataFrame.
df = pd.read_csv('../data/processed/cleaned_food_delivery_data.csv')

# Displaying the first 5 rows via head method.
print(df.head())

   Distance_km  Weather  Traffic_Level  Vehicle_Type  Preparation_Time_min  \
0         7.93      1.0            0.0           1.0                    12   
1        16.42      0.0            1.0           2.0                    20   
2         9.52      2.0            0.0           1.0                    28   
3         7.44      3.0            1.0           1.0                     5   
4        19.03      0.0            0.0           2.0                    16   

   Courier_Experience_yrs  Delivery_Time_min  Time_of_Day_Afternoon  \
0                     1.0                 43                    1.0   
1                     2.0                 84                    0.0   
2                     1.0                 59                    0.0   
3                     1.0                 37                    1.0   
4                     5.0                 68                    0.0   

   Time_of_Day_Evening  Time_of_Day_Morning  Time_of_Day_Night  
0                  0.0                 

In [94]:
#Verifying Columns in the dataset
print(df.columns)

Index(['Distance_km', 'Weather', 'Traffic_Level', 'Vehicle_Type',
       'Preparation_Time_min', 'Courier_Experience_yrs', 'Delivery_Time_min',
       'Time_of_Day_Afternoon', 'Time_of_Day_Evening', 'Time_of_Day_Morning',
       'Time_of_Day_Night'],
      dtype='object')


# **Calculating Basetime**

We are calculating the Base Time that will require to deliver the food to the cusomers.

In [95]:
baseTime = df['Preparation_Time_min'] + (3 * df['Distance_km'])
print(baseTime.head())

0    35.79
1    69.26
2    56.56
3    27.32
4    73.09
dtype: float64


In [96]:
# Adding Base_Time column to the dataframe
df['Base_Time'] = baseTime

#Verifying column results:
print(df.head())

   Distance_km  Weather  Traffic_Level  Vehicle_Type  Preparation_Time_min  \
0         7.93      1.0            0.0           1.0                    12   
1        16.42      0.0            1.0           2.0                    20   
2         9.52      2.0            0.0           1.0                    28   
3         7.44      3.0            1.0           1.0                     5   
4        19.03      0.0            0.0           2.0                    16   

   Courier_Experience_yrs  Delivery_Time_min  Time_of_Day_Afternoon  \
0                     1.0                 43                    1.0   
1                     2.0                 84                    0.0   
2                     1.0                 59                    0.0   
3                     1.0                 37                    1.0   
4                     5.0                 68                    0.0   

   Time_of_Day_Evening  Time_of_Day_Morning  Time_of_Day_Night  Base_Time  
0                  0.0      

# **Adding Traffic Factor**

In [97]:
# Adding Traffic Factor 1.0 for low, 1.2 for medium and 1.50 for hgih
conditions = [
    (df.Traffic_Level == 0),
    (df.Traffic_Level == 1),
    (df.Traffic_Level == 2)
]

choices = [
    (df.Base_Time * 1.0),
    (df.Base_Time * 1.2),
    (df.Base_Time * 1.5)
]

expectedTime = np.select(conditions, choices, default=df.Base_Time * 1.0)


In [98]:
print(expectedTime)

[ 35.79   83.112  56.56   32.784  73.09   66.2    40.56   68.604  25.34
  60.86   76.296  75.62   67.005  31.284  27.6    21.51   58.58   13.59
  25.74   50.568  74.68   55.24   42.87   66.735  67.92   70.524  59.676
  59.76   23.12   76.98   55.968 117.525  53.61  103.428  41.292  83.28
  47.856  67.86   33.672  41.424  40.848  68.484  21.564   8.33   21.48
  66.612  40.37   36.79   29.27   43.71   47.472  39.49   95.316  38.3
  32.92   47.05   43.7    22.7    60.888  89.22   63.53   71.22   53.82
  41.88   34.55   47.2    65.93   28.308  72.816  81.36   76.035  17.51
  38.445  35.14   55.62   45.06   55.82   41.24   40.056  40.08   75.768
  32.75   71.808  54.42   40.38   65.94   83.09   35.9    42.09   49.29
  97.92   72.255  65.23   84.765  36.96   13.76   29.45   98.1    36.456
  47.7    93.72   98.4    28.296  85.536  54.792  32.52   76.77   92.505
  31.08  100.455  28.8    39.78   63.72   67.308  68.19   82.02   42.708
  66.648  89.4    23.928  69.684  87.99   33.59   74.964  26

# **Adding Weather Factor**

In [99]:
conditions1 = [
    (df.Weather == 0),
    (df.Weather == 1),
    (df.Weather == 2),
    (df.Weather == 3),
    (df.Weather == 4)
]

choices1 = [
    (expectedTime * 1.0),
    (expectedTime * 1.1),
    (expectedTime * 1.2),
    (expectedTime * 1.3),
    (expectedTime * 1.5)
]

expectedTime = np.select(conditions1, choices1, default=expectedTime)

print(expectedTime)

[ 39.369   83.112   67.872   42.6192  73.09    66.2     40.56    68.604
  38.01    73.032  114.444   75.62    67.005   40.6692  27.6     32.265
  76.154   20.385   30.888   50.568   97.084   71.812   42.87    66.735
  67.92    70.524   65.6436  89.64    34.68    76.98    55.968  152.7825
  53.61   124.1136  45.4212 124.92    47.856   74.646   33.672   53.8512
  53.1024  68.484   21.564    8.33    32.22    73.2732  60.555   36.79
  38.051   43.71    61.7136  51.337  142.974   49.79    36.212   47.05
  56.81    22.7     60.888   89.22    63.53    78.342   69.966   46.068
  34.55    47.2     72.523   28.308   72.816   81.36   114.0525  17.51
  49.9785  52.71    72.306   45.06    55.82    41.24    60.084   48.096
  98.4984  32.75    71.808   54.42    40.38    65.94    83.09    35.9
  42.09    73.935  107.712   93.9315  65.23   110.1945  48.048   13.76
  29.45   107.91    36.456   47.7     93.72   127.92    28.296   85.536
  54.792   32.52    76.77   138.7575  31.08   130.5915  34.56    59.

# **Applying Courier Experience**

Applying the rider or driver's experience to reduce the expected time.

In [100]:
# Applying 2% time reduction for every year of experience the courier has.
expectedTime = expectedTime * (1 - (df.Courier_Experience_yrs * 0.02))
expectedTime

0       38.581620
1       79.787520
2       66.514560
3       41.766816
4       65.781000
          ...    
995     54.285000
996     60.591440
997    157.377600
998     50.510000
999     49.507920
Name: Courier_Experience_yrs, Length: 1000, dtype: float64

In [101]:
#Adding Expected Time column to the dataframe to make isLate decision.
df['Expected_Time_min'] = expectedTime

In [102]:
#Verifying changes 
df.columns

Index(['Distance_km', 'Weather', 'Traffic_Level', 'Vehicle_Type',
       'Preparation_Time_min', 'Courier_Experience_yrs', 'Delivery_Time_min',
       'Time_of_Day_Afternoon', 'Time_of_Day_Evening', 'Time_of_Day_Morning',
       'Time_of_Day_Night', 'Base_Time', 'Expected_Time_min'],
      dtype='object')

In [103]:
df[['Delivery_Time_min', 'Expected_Time_min', 'Base_Time']].head()

Unnamed: 0,Delivery_Time_min,Expected_Time_min,Base_Time
0,43,38.58162,35.79
1,84,79.78752,69.26
2,59,66.51456,56.56
3,37,41.766816,27.32
4,68,65.781,73.09


In [104]:
df['Is_Late'] = np.where(df['Delivery_Time_min'] > df['Expected_Time_min'], 1, 0)
df['Is_Late']

0      1
1      1
2      0
3      0
4      1
      ..
995    0
996    1
997    0
998    1
999    1
Name: Is_Late, Length: 1000, dtype: int64

In [105]:
df.Is_Late.sum()

np.int64(522)

# **Saving Feature Engineered Data to CSV File for Classification Model Training.**

In [106]:
df.to_csv('../data/feature_engineered/feature_engineered_food_delivery_data.csv', index=False)
df.columns

Index(['Distance_km', 'Weather', 'Traffic_Level', 'Vehicle_Type',
       'Preparation_Time_min', 'Courier_Experience_yrs', 'Delivery_Time_min',
       'Time_of_Day_Afternoon', 'Time_of_Day_Evening', 'Time_of_Day_Morning',
       'Time_of_Day_Night', 'Base_Time', 'Expected_Time_min', 'Is_Late'],
      dtype='object')