# Problem Definition & Notes
- Objective
    - Build ML model to predict delivery time
- Core ML Problem
    - Regression Problem
- Evaluation Metric
    - RMSE
- Data Processing
    - Import Data
    - ETL
    - Perform EDA
    - Train-Test Split
    - Data cleansing
    - Feature Engineering
    - Feature Selection
    - Feature scaling/encoding
- Model Architecture
    - Baseline Model
    - ML Model
    - Loss Function
    - Cross-Validation Technique
- Offline Evaluation
    - Evaluate on Test Set (aggregated metric)
    - Evaluate ML model & baseline model
    - Evalaute various segments of data

# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold

from statsmodels.stats.outliers_influence import variance_inflation_factor
import lightgbm as lgb
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import sys
sys.path.append('../src')

from config import *
from utils import *

# Import Data

In [None]:
import pandas as pd

def import_data(filepath: str) -> pd.DataFrame:
    """
    Load data from CSV
    """

    try:
        df = pd.read_csv(filepath)
        print('Data loaded to Pandas dataframe')
        return df
    
    except Exception as e:
        print(f"Failed to load data: {e}")

def format_raw_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    
    """

    df_processed = df.copy()

    # format column names (lowercase, remove spaces, remove special characters)
    df_processed.columns = df_processed.columns.str.lower()
    df_processed.columns = df_processed.columns.str.replace(' ', '').str.replace('(', '_').str.replace(')', '')

    # convert column types to relevant types (str, float, datetime)
    df_processed['delivery_person_age'] = df_processed['delivery_person_age'].astype(float)
    df_processed['delivery_person_ratings'] = df_processed['delivery_person_ratings'].astype(float)
    df_processed['order_date'] = pd.to_datetime(df_processed['order_date'], errors='coerce').dt.strftime("%Y-%m-%d")
    df_processed['time_orderd'] = pd.to_datetime(df_processed['time_orderd'], errors='coerce').dt.strftime("%H:%M:%S")
    df_processed['time_order_picked'] = pd.to_datetime(df_processed['time_order_picked'], errors='coerce').dt.strftime("%H:%M:%S")
    df_processed['multiple_deliveries'] = df_processed['multiple_deliveries'].astype(float)
    df_processed['time_taken_min'] = df_processed['time_taken_min'].str.replace('(min) ', '')
    df_processed['time_taken_min'] = df_processed['time_taken_min'].astype(int)
    
    # remove spaces
    for col in df_processed.select_dtypes(object):
        df_processed[col] = df_processed[col].str.strip()

    # replace "NaN" with np.nan
    df_processed = df_processed.replace('NaN', np.nan, regex=False)

    # lower case and remove spaces in values
    lower_cols = df_processed.select_dtypes(object).drop(['id', 'delivery_person_id', 'order_date', 'time_orderd', 'time_order_picked'], axis=1).columns.tolist()
    df_processed[lower_cols] = df_processed[lower_cols].apply(lambda x: x.str.lower().str.strip().str.replace(' ', '').str.replace('-', '_'))

    # log statistics
    print(df_processed.describe())

    # log non NaN row (%)
    print(f"Non Missing Rows: {(len(df_processed.dropna())/len(df_processed)) * 100:.2f}%")

    # log outliers
    Q1 = df_processed['time_taken_min'].quantile(0.25) 
    Q3 = df_processed['time_taken_min'].quantile(0.75)
    IQR = Q3-Q1

    print(f"Outlier Upper: {len(df_processed[df_processed['time_taken_min'] < Q1 - 1.5*IQR])/len(df_processed):.2f}%")
    print(f"Outlier Lower: {len(df_processed[df_processed['time_taken_min'] > Q3 + 1.5*IQR].describe())/len(df_processed):.2f}%")

    return df_processed

def validate_schema(df: pd.DataFrame, expected_columns: dict) -> bool:
    """Check if all expected columns exist"""

    missing = set(expected_columns) - set(df.columns)
    missing_inverse = set(df.columns) - set(expected_columns)
    
    if missing:
        raise ValueError(f"Missing columns: {missing}")
    
    if missing_inverse:
        print('Extra column in dataframe')
        
    return True

def validate_dtypes(df: pd.DataFrame, expected_dtypes: dict) -> bool:
    """Check data types match expected"""

    for col, expected_type in expected_dtypes.items():
        if df[col].dtype != expected_type:
            raise TypeError(f"{col}: expected {expected_type}, got {df[col].dtype}")
        
    return True

In [4]:
df_raw_data = import_data(order_data_path_csv)
df_raw_data.head()

Data loaded to Pandas dataframe


Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weatherconditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min)
0,0x4607,INDORES13DEL02,37,4.9,22.745049,75.892471,22.765049,75.912471,19-03-2022,11:30:00,11:45:00,conditions Sunny,High,2,Snack,motorcycle,0,No,Urban,(min) 24
1,0xb379,BANGRES18DEL02,34,4.5,12.913041,77.683237,13.043041,77.813237,25-03-2022,19:45:00,19:50:00,conditions Stormy,Jam,2,Snack,scooter,1,No,Metropolitian,(min) 33
2,0x5d6d,BANGRES19DEL01,23,4.4,12.914264,77.6784,12.924264,77.6884,19-03-2022,08:30:00,08:45:00,conditions Sandstorms,Low,0,Drinks,motorcycle,1,No,Urban,(min) 26
3,0x7a6a,COIMBRES13DEL02,38,4.7,11.003669,76.976494,11.053669,77.026494,05-04-2022,18:00:00,18:10:00,conditions Sunny,Medium,0,Buffet,motorcycle,1,No,Metropolitian,(min) 21
4,0x70a2,CHENRES12DEL01,32,4.6,12.972793,80.249982,13.012793,80.289982,26-03-2022,13:30:00,13:45:00,conditions Cloudy,High,1,Snack,scooter,1,No,Metropolitian,(min) 30


In [5]:
validate_schema(df_raw_data, raw_data_expected_columns)

True

In [6]:
validate_dtypes(df_raw_data, raw_data_expected_columns)

True

# ETL

In [7]:
df_etl_data = format_raw_data(df_raw_data)

  df_processed['order_date'] = pd.to_datetime(df_processed['order_date'], errors='coerce').dt.strftime("%Y-%m-%d")
  df_processed['time_orderd'] = pd.to_datetime(df_processed['time_orderd'], errors='coerce').dt.strftime("%H:%M:%S")
  df_processed['time_order_picked'] = pd.to_datetime(df_processed['time_order_picked'], errors='coerce').dt.strftime("%H:%M:%S")


       delivery_person_age  delivery_person_ratings  restaurant_latitude  \
count         43739.000000             43685.000000         45593.000000   
mean             29.567137                 4.633780            17.017729   
std               5.815155                 0.334716             8.185109   
min              15.000000                 1.000000           -30.905562   
25%              25.000000                 4.500000            12.933284   
50%              30.000000                 4.700000            18.546947   
75%              35.000000                 4.900000            22.728163   
max              50.000000                 6.000000            30.914057   

       restaurant_longitude  delivery_location_latitude  \
count          45593.000000                45593.000000   
mean              70.231332                   17.465186   
std               22.883647                    7.335122   
min              -88.366217                    0.010000   
25%               73

# Data Pre-Processing

In [8]:
def train_test_split_time_based(df: pd.DataFrame, train_percent: float) -> pd.DataFrame:
    """
    
    """

    df_processed = df.copy()
    df_processed = df_processed.sort_values(['order_date', 'time_orderd']).reset_index(drop=True)

    # Train-Test Split
    df_train = df_processed[0:int(train_percent*len(df_processed))].copy()
    df_test = df_processed[int(train_percent*len(df_processed)):].copy()

    assert len(df_train) > 0
    assert len(df_test) > 0

    return df_train, df_test

##### Train-Test Split

In [9]:
df_train, df_test = train_test_split_time_based(df_etl_data, 0.8)

In [10]:
df_train['delivery_person_ratings'] = df_train['delivery_person_ratings'].astype(float)
df_test['delivery_person_ratings'] = df_test['delivery_person_ratings'].astype(float)

##### Imput Missing Values

In [11]:
def impute_missing_values(df_train: pd.DataFrame, df_test: pd.DataFrame, mean_impute_cols: list, median_impute_cols: list, mode_impute_cols: list) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    """

    df_train_imputed = df_train.copy()
    df_test_imputed = df_test.copy()

    for col in mean_impute_cols+median_impute_cols+mode_impute_cols:
        if df_train_imputed[col].isna().all():
            print(f'{col}: All values are Nan')

    for col in mean_impute_cols:
        imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
        df_train_imputed[col] = imp_mean.fit_transform(df_train_imputed[[col]]).ravel()
        df_test_imputed[col] = imp_mean.transform(df_test_imputed[[col]]).ravel()

    for col in median_impute_cols:
        imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
        df_train_imputed[col] = imp_median.fit_transform(df_train_imputed[[col]]).ravel()
        df_test_imputed[col] = imp_median.transform(df_test_imputed[[col]]).ravel()

    for col in mode_impute_cols:
        imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
        df_train_imputed[col] = imp_mode.fit_transform(df_train_imputed[[col]]).ravel()
        df_test_imputed[col] = imp_mode.transform(df_test_imputed[[col]]).ravel()

    assert df_train_imputed.isna().sum().sum() == 0, "Missing values in train data"
    assert df_test_imputed.isna().sum().sum() == 0, "Missing values in test data"

    return df_train_imputed, df_test_imputed

In [12]:
df_train_imputed, df_test_imputed = impute_missing_values(df_train, df_test, mean_impute_columns, median_impute_columns, mode_impute_columns)
df_test_imputed

Unnamed: 0,id,delivery_person_id,delivery_person_age,delivery_person_ratings,restaurant_latitude,restaurant_longitude,delivery_location_latitude,delivery_location_longitude,order_date,time_orderd,time_order_picked,weatherconditions,road_traffic_density,vehicle_condition,type_of_order,type_of_vehicle,multiple_deliveries,festival,city,time_taken_min
36474,0xbd01,PUNERES09DEL03,30.000000,4.6,18.536562,73.896485,18.626562,73.986485,2022-03-29,19:30:00,19:45:00,conditionscloudy,jam,1,buffet,motorcycle,0.0,no,metropolitian,46
36475,0x5980,PUNERES07DEL01,28.000000,4.0,18.546947,73.900626,18.616947,73.970626,2022-03-29,19:30:00,19:45:00,conditionswindy,jam,0,buffet,motorcycle,0.0,no,metropolitian,38
36476,0x6ff3,MUMRES18DEL01,28.000000,4.7,19.109300,72.825451,19.179300,72.895451,2022-03-29,19:30:00,19:40:00,conditionssandstorms,jam,2,buffet,motorcycle,0.0,no,urban,24
36477,0x4a6f,VADRES15DEL02,35.000000,3.5,0.000000,0.000000,0.080000,0.080000,2022-03-29,19:30:00,19:40:00,conditionswindy,jam,2,buffet,scooter,1.0,no,urban,34
36478,0x4f5d,COIMBRES03DEL01,37.000000,4.9,11.025083,77.015393,11.095083,77.085393,2022-03-29,19:30:00,19:40:00,conditionssunny,jam,1,buffet,motorcycle,1.0,no,urban,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45588,0x1396,HYDRES03DEL01,29.567968,4.7,17.422819,78.449578,17.492819,78.519578,2022-04-06,17:55:00,22:50:00,conditionsfog,low,2,drinks,scooter,1.0,no,urban,25
45589,0x4ca,MYSRES05DEL01,29.567968,4.7,12.323978,76.627961,12.433978,76.737961,2022-04-06,17:55:00,19:25:00,conditionsfog,jam,1,meal,motorcycle,1.0,no,urban,41
45590,0x4fe,SURRES04DEL03,29.567968,4.7,-21.173493,-72.801953,21.263493,72.891953,2022-04-06,17:55:00,19:15:00,conditionsstormy,jam,1,drinks,scooter,1.0,no,metropolitian,23
45591,0x576,HYDRES11DEL02,29.567968,4.7,-17.430448,78.418213,17.560448,78.548213,2022-04-06,17:55:00,22:45:00,conditionsfog,low,2,meal,scooter,1.0,no,metropolitian,37


##### Feature Engineering

In [13]:
# day of week (1-7)
# hour of day (0-23)
# distance
# km/hr per driver

In [14]:
def straight_line_distance(train: pd.DataFrame, test: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    """

    R = 6371  # Earth radius in km

    dfs = []
    for df in [train.copy(), test.copy()]:
        lat1, lon1, lat2, lon2 = map(np.radians, [df['restaurant_latitude'], df['restaurant_longitude'], df['delivery_location_latitude'], df['delivery_location_longitude']])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        
        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        c = 2 * np.arcsin(np.sqrt(a))

        df['distance'] = R * c
        df = df.drop(['restaurant_latitude', 'restaurant_longitude', 'delivery_location_latitude', 'delivery_location_longitude'], axis=1)

        dfs.append(df)
        
    return dfs[0], dfs[1]

In [15]:
df_train_imputed, df_test_imputed = straight_line_distance(df_train_imputed, df_test_imputed)
df_test_imputed

Unnamed: 0,id,delivery_person_id,delivery_person_age,delivery_person_ratings,order_date,time_orderd,time_order_picked,weatherconditions,road_traffic_density,vehicle_condition,type_of_order,type_of_vehicle,multiple_deliveries,festival,city,time_taken_min,distance
36474,0xbd01,PUNERES09DEL03,30.000000,4.6,2022-03-29,19:30:00,19:45:00,conditionscloudy,jam,1,buffet,motorcycle,0.0,no,metropolitian,46,13.788852
36475,0x5980,PUNERES07DEL01,28.000000,4.0,2022-03-29,19:30:00,19:45:00,conditionswindy,jam,0,buffet,motorcycle,0.0,no,metropolitian,38,10.724652
36476,0x6ff3,MUMRES18DEL01,28.000000,4.7,2022-03-29,19:30:00,19:40:00,conditionssandstorms,jam,2,buffet,motorcycle,0.0,no,urban,24,10.707675
36477,0x4a6f,VADRES15DEL02,35.000000,3.5,2022-03-29,19:30:00,19:40:00,conditionswindy,jam,2,buffet,scooter,1.0,no,urban,34,12.580268
36478,0x4f5d,COIMBRES03DEL01,37.000000,4.9,2022-03-29,19:30:00,19:40:00,conditionssunny,jam,1,buffet,motorcycle,1.0,no,urban,22,10.905989
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45588,0x1396,HYDRES03DEL01,29.567968,4.7,2022-04-06,17:55:00,22:50:00,conditionsfog,low,2,drinks,scooter,1.0,no,urban,25,10.757205
45589,0x4ca,MYSRES05DEL01,29.567968,4.7,2022-04-06,17:55:00,19:25:00,conditionsfog,jam,1,meal,motorcycle,1.0,no,urban,41,17.097973
45590,0x4fe,SURRES04DEL03,29.567968,4.7,2022-04-06,17:55:00,19:15:00,conditionsstormy,jam,1,drinks,scooter,1.0,no,metropolitian,23,16466.194147
45591,0x576,HYDRES11DEL02,29.567968,4.7,2022-04-06,17:55:00,22:45:00,conditionsfog,low,2,meal,scooter,1.0,no,metropolitian,37,3890.836127


In [16]:
def date_time_features(train: pd.DataFrame, test: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    
    dfs = []
    for df in [train.copy(), test.copy()]:
        df['day_of_week'] = pd.to_datetime(df['order_date']).dt.day_of_week
        df['hour_of_day'] = pd.to_datetime(df['time_orderd']).dt.hour
        df = df.drop(['order_date', 'time_orderd', 'time_order_picked'], axis=1)
        dfs.append(df)
    
    return dfs[0], dfs[1]

In [17]:
df_train_imputed, df_test_imputed = date_time_features(df_train_imputed, df_test_imputed)
df_test_imputed

  df['hour_of_day'] = pd.to_datetime(df['time_orderd']).dt.hour
  df['hour_of_day'] = pd.to_datetime(df['time_orderd']).dt.hour


Unnamed: 0,id,delivery_person_id,delivery_person_age,delivery_person_ratings,weatherconditions,road_traffic_density,vehicle_condition,type_of_order,type_of_vehicle,multiple_deliveries,festival,city,time_taken_min,distance,day_of_week,hour_of_day
36474,0xbd01,PUNERES09DEL03,30.000000,4.6,conditionscloudy,jam,1,buffet,motorcycle,0.0,no,metropolitian,46,13.788852,1,19
36475,0x5980,PUNERES07DEL01,28.000000,4.0,conditionswindy,jam,0,buffet,motorcycle,0.0,no,metropolitian,38,10.724652,1,19
36476,0x6ff3,MUMRES18DEL01,28.000000,4.7,conditionssandstorms,jam,2,buffet,motorcycle,0.0,no,urban,24,10.707675,1,19
36477,0x4a6f,VADRES15DEL02,35.000000,3.5,conditionswindy,jam,2,buffet,scooter,1.0,no,urban,34,12.580268,1,19
36478,0x4f5d,COIMBRES03DEL01,37.000000,4.9,conditionssunny,jam,1,buffet,motorcycle,1.0,no,urban,22,10.905989,1,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45588,0x1396,HYDRES03DEL01,29.567968,4.7,conditionsfog,low,2,drinks,scooter,1.0,no,urban,25,10.757205,2,17
45589,0x4ca,MYSRES05DEL01,29.567968,4.7,conditionsfog,jam,1,meal,motorcycle,1.0,no,urban,41,17.097973,2,17
45590,0x4fe,SURRES04DEL03,29.567968,4.7,conditionsstormy,jam,1,drinks,scooter,1.0,no,metropolitian,23,16466.194147,2,17
45591,0x576,HYDRES11DEL02,29.567968,4.7,conditionsfog,low,2,meal,scooter,1.0,no,metropolitian,37,3890.836127,2,17


In [18]:
def driver_statistics(train: pd.DataFrame, test: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    
    """

    df_output = train.copy()

    df_driver_speed = df_output.groupby(['delivery_person_id'], as_index=False)[['distance', 'time_taken_min']].sum()
    df_driver_speed['driver_speed'] = df_driver_speed['distance'] / df_driver_speed['time_taken_min']
    df_driver_speed = df_driver_speed[['delivery_person_id', 'driver_speed']]

    df_avg_rating = df_output.groupby('delivery_person_id', as_index=False)[['delivery_person_ratings']].mean()
    df_avg_rating = df_avg_rating.rename(columns={'delivery_person_ratings': 'avg_driver_rating'})

    dfs = []
    for df in [train.copy(), test.copy()]:

        df = pd.merge(df, df_driver_speed, how='left', on='delivery_person_id')
        df = pd.merge(df, df_avg_rating, how='left', on='delivery_person_id')

        df = df.drop(['delivery_person_ratings'], axis=1)

        dfs.append(df)

    return dfs[0], dfs[1]

In [19]:
df_train_imputed, df_test_imputed = driver_statistics(df_train_imputed, df_test_imputed)

In [20]:
assert df_train_imputed.isna().sum().sum() == 0, 'Missing Values in dataset'
assert df_test_imputed.isna().sum().sum() == 0, 'Missing Values in dataset'

##### Feature Selection

In [21]:
# features = df_train.select_dtypes([float, int]).drop(['delivery_person_ratings', 'time_taken_min', 'delivery_location_longitude', 'restaurant_longitude', 'delivery_location_latitude', 'restaurant_latitude'], axis=1)

# vif_data = pd.DataFrame()
# vif_data["Feature"] = features.columns
# vif_data["VIF"] = [variance_inflation_factor(features.values, i) 
#                    for i in range(features.shape[1])]

# # VIF > 10 indicates high collinearity
# print(vif_data.sort_values('VIF', ascending=False))

In [22]:
df_baseline_train = df_train.copy()
df_baseline_test = df_test.copy()

##### Feature Scaling/Encoding

In [23]:
def feature_scaling_encoding(train: pd.DataFrame, test: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    """

    df_train = train.copy()
    df_test = test.copy()

    # Scale
    scaler = StandardScaler()
    scaled_cols = df_train.drop(['time_taken_min'], axis=1).select_dtypes([float, int]).columns.tolist()

    df_train[scaled_cols] = scaler.fit_transform(df_train[scaled_cols])
    df_test[scaled_cols] = scaler.transform(df_test[scaled_cols])

    # Encode
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoded_cols = df_train.drop(['id', 'delivery_person_id'], axis=1).select_dtypes(object).columns.tolist()

    encoded_array = encoder.fit_transform(df_train[encoded_cols])
    features_names = encoder.get_feature_names_out()
    df_encoded = pd.DataFrame(columns=features_names, data=encoded_array)
    df_train = pd.concat([df_train.drop(encoded_cols, axis=1), df_encoded], axis=1)

    encoded_array = encoder.transform(df_test[encoded_cols])
    features_names = encoder.get_feature_names_out()
    df_encoded = pd.DataFrame(columns=features_names, data=encoded_array)
    df_test = pd.concat([df_test.drop(encoded_cols, axis=1), df_encoded], axis=1)

    return df_train, df_test

In [24]:
df_train_imputed, df_test_imputed = feature_scaling_encoding(df_train_imputed, df_test_imputed)

In [25]:
features = df_train_imputed.drop(['id', 'delivery_person_id', 'time_taken_min'], axis=1).columns.tolist()
target = ['time_taken_min']

# Model Architecture

In [26]:
def feature_target_split(train: pd.DataFrame, test: pd.DataFrame, features: list, target: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Output: train_x, train_y, test_x, test_y
    """

    dfs = []

    for df in [train.copy(), test.copy()]:

        df_x = df[features]
        df_y = df[target]
        df_id = df[['id']]

        dfs.append([df_x, df_y, df_id])

    return dfs[0][0], dfs[0][1], dfs[0][2], dfs[1][0], dfs[1][1], dfs[1][2]


In [27]:
df_train_x, df_train_y, df_train_id, df_test_x, df_test_y, df_test_id = feature_target_split(df_train_imputed, df_test_imputed, features, target)

##### Baseline Model

In [28]:
def baseline_model(train: pd.DataFrame, test: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    """

    df_avg_time_city = train.groupby('city', as_index=False)[['time_taken_min']].mean().rename(columns={'time_taken_min': 'baseline_time_taken_min'})

    df_baseline_train = train.merge(df_avg_time_city, how='left', on='city')
    df_baseline_test = test.merge(df_avg_time_city, how='left', on='city')

    mean_baseline = df_baseline_train['baseline_time_taken_min'].mean()
    df_baseline_train['baseline_time_taken_min'] = df_baseline_train['baseline_time_taken_min'].fillna(mean_baseline)
    df_baseline_test['baseline_time_taken_min'] = df_baseline_test['baseline_time_taken_min'].fillna(mean_baseline)

        
    return df_baseline_train[['id', 'baseline_time_taken_min']], df_baseline_test[['id', 'baseline_time_taken_min']]

In [29]:
class BaselineModel:
    def __init__(self):
        self.avg_time_city = None
        self.overall_mean = None
    
    def fit(self, X: pd.DataFrame, y=None) -> 'BaselineModel':
        self.avg_time_city = X.groupby('city', as_index=False)[['time_taken_min']].mean().rename(columns={'time_taken_min': 'baseline_time_taken_min'})
        self.overall_mean = X['time_taken_min'].mean()

        return self

    def predict(self, X: pd.DataFrame) -> pd.DataFrame:
        result = X.merge(self.avg_time_city, how='left', on='city')
        result['baseline_predictions'] = result['baseline_time_taken_min'].fillna(self.overall_mean)

        return result[['id', 'baseline_predictions']]

In [30]:
baseline_model = BaselineModel()
baseline_model.fit(df_train)
baseline_predictions = baseline_model.predict(df_test)
baseline_predictions

Unnamed: 0,id,baseline_predictions
0,0xbd01,27.293102
1,0x5980,27.293102
2,0x6ff3,23.009169
3,0x4a6f,23.009169
4,0x4f5d,23.009169
...,...,...
9114,0x1396,23.009169
9115,0x4ca,23.009169
9116,0x4fe,27.293102
9117,0x576,27.293102


In [31]:
class GradientBoostedDecisionTree:
    def __init__(self, params=None):
        self.params = params or {'objective': 'regression', 'metric': 'rmse'}
        self.model = None

    def fit(self, X: pd.DataFrame, y: pd.DataFrame) -> 'GradientBoostedDecisionTree':
        lgbm_train = lgb.Dataset(X, y)
        self.model = lgb.train(self.params,lgbm_train)
        return self

    def predict(self, X: pd.DataFrame, df_id: pd.DataFrame) -> pd.DataFrame:
        predictions = df_id[['id']].copy()
        predictions['GradientBoostedDecisionTree_predictions'] = self.model.predict(X)

        return predictions

In [32]:
decision_tree_model = GradientBoostedDecisionTree()
decision_tree_model.fit(X=df_train_x, y=df_train_y)
model_predictions = decision_tree_model.predict(X=df_test_x, df_id=df_test_id)
model_predictions

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001854 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 872
[LightGBM] [Info] Number of data points in the train set: 36474, number of used features: 32
[LightGBM] [Info] Start training from score 26.273428


Unnamed: 0,id,GradientBoostedDecisionTree_predictions
0,0xbd01,41.421890
1,0x5980,31.827336
2,0x6ff3,20.896368
3,0x4a6f,32.493085
4,0x4f5d,24.139227
...,...,...
9114,0x1396,28.460540
9115,0x4ca,36.221039
9116,0x4fe,24.124119
9117,0x576,23.910267


##### Gradient Boosted Decision Tree

In [33]:
# def objective(trial):
#     # Cross validation splits
#     kf = KFold(n_splits=2, shuffle=True, random_state=42)
    
#     # Parameters
#     params = {
#         'objective': 'regression',
#         'metric': 'rmse',
#         'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
#         'num_leaves': trial.suggest_int('num_leaves', 2, 256),
#         'max_depth': trial.suggest_int('max_depth', 2, 32),
#         'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
#         'subsample': trial.suggest_float('subsample', 0.6, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
#         'verbose': -1
#     }
    
#     rmse_scores = []
    
#     # K-fold cross validation
#     for train_idx, valid_idx in kf.split(df_train_x):
#         X_tr, X_val = df_train_x.iloc[train_idx], df_train_x.iloc[valid_idx]
#         y_tr, y_val = df_train_y.iloc[train_idx], df_train_y.iloc[valid_idx]
        
#         lgb_train = lgb.Dataset(X_tr, y_tr)
#         lgb_valid = lgb.Dataset(X_val, y_val, reference=lgb_train)
        
#         model = lgb.train(
#             params,
#             lgb_train,
#             valid_sets=[lgb_valid],
#             callbacks=[lgb.early_stopping(10, verbose=False)],
#         )
        
#         y_pred = model.predict(X_val, num_iteration=model.best_iteration)
#         rmse = root_mean_squared_error(y_val, y_pred)
#         rmse_scores.append(rmse)
    
#     return np.mean(rmse_scores)

In [34]:
# study = optuna.create_study(direction='minimize')  # Minimize RMSE
# study.optimize(objective, n_trials=5)

# # After study.optimize()
# best_params = study.best_params
# best_params.update({
#     'objective': 'regression',
#     'metric': 'rmse',
#     'verbose': -1
# })

# best_params

In [None]:
def hyperparameter_tuning(X, y, n_splits: int, n_trials: int, params: dict, model) -> dict:
    """
    Performs hyperparameter tuning using Optuna with K-fold cross-validation.
    
    Args:
        n_splits: Number of K-fold splits
        n_trials: Number of Optuna trials
        params: Dictionary with 'search_space' and 'fixed_params'
        model: Model type ('lgb' supported)
        X: Training features
        y: Training target
        
    Returns:
        dict: Best parameters found
    """
    
    def objective(trial):
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        
        # Build params from search space
        trial_params = params.get('fixed_params', {}).copy()
        for param_name, param_config in params['search_space'].items():
            if param_config['type'] == 'int':
                trial_params[param_name] = trial.suggest_int(
                    param_name, param_config['low'], param_config['high']
                )
            elif param_config['type'] == 'float':
                trial_params[param_name] = trial.suggest_float(
                    param_name, param_config['low'], param_config['high']
                )
        
        rmse_scores = []
        
        for train_idx, valid_idx in kf.split(X):
            X_tr, X_val = X.iloc[train_idx], X.iloc[valid_idx]
            y_tr, y_val = y.iloc[train_idx], y.iloc[valid_idx]
            
            lgb_train = lgb.Dataset(X_tr, y_tr)
            lgb_valid = lgb.Dataset(X_val, y_val, reference=lgb_train)
            
            trained_model = lgb.train(
                trial_params,
                lgb_train,
                valid_sets=[lgb_valid],
                callbacks=[lgb.early_stopping(10, verbose=False)],
            )
            
            y_pred = trained_model.predict(X_val, num_iteration=trained_model.best_iteration)
            rmse = root_mean_squared_error(y_val, y_pred)
            rmse_scores.append(rmse)
        
        return np.mean(rmse_scores)
    
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)
    
    best_params = study.best_params
    best_params.update(params.get('fixed_params', {}))
    
    return best_params

In [44]:
decision_tree_model = GradientBoostedDecisionTree()
params = {
    'fixed_params': {
        'objective': 'regression',
        'metric': 'rmse',
        'verbose': -1
    },
    'search_space': {
        'n_estimators': {'type': 'int', 'low': 50, 'high': 1000},
        'num_leaves': {'type': 'int', 'low': 2, 'high': 256},
        'max_depth': {'type': 'int', 'low': 2, 'high': 32},
        'min_child_samples': {'type': 'int', 'low': 20, 'high': 100},
        'subsample': {'type': 'float', 'low': 0.6, 'high': 1.0},
        'colsample_bytree': {'type': 'float', 'low': 0.6, 'high': 1.0}
    }
}

best_params = hyperparameter_tuning(df_train_x, df_train_y, 2, 5, params, decision_tree_model)
best_params

[I 2025-12-04 12:02:29,768] A new study created in memory with name: no-name-78551989-ba64-45fc-b771-a17aa82d6b66
[I 2025-12-04 12:02:32,657] Trial 0 finished with value: 4.219051411964841 and parameters: {'n_estimators': 91, 'num_leaves': 98, 'max_depth': 18, 'min_child_samples': 56, 'subsample': 0.9926257915652115, 'colsample_bytree': 0.8070896531447753}. Best is trial 0 with value: 4.219051411964841.
[I 2025-12-04 12:02:34,513] Trial 1 finished with value: 4.2685400651282555 and parameters: {'n_estimators': 578, 'num_leaves': 55, 'max_depth': 29, 'min_child_samples': 72, 'subsample': 0.8547110538631251, 'colsample_bytree': 0.6557961578975232}. Best is trial 0 with value: 4.219051411964841.
[I 2025-12-04 12:02:38,720] Trial 2 finished with value: 4.2350473591592195 and parameters: {'n_estimators': 622, 'num_leaves': 204, 'max_depth': 11, 'min_child_samples': 27, 'subsample': 0.986748233528954, 'colsample_bytree': 0.7065897685964646}. Best is trial 0 with value: 4.219051411964841.
[I 

{'n_estimators': 91,
 'num_leaves': 98,
 'max_depth': 18,
 'min_child_samples': 56,
 'subsample': 0.9926257915652115,
 'colsample_bytree': 0.8070896531447753,
 'objective': 'regression',
 'metric': 'rmse',
 'verbose': -1}

In [45]:
decision_tree_model = GradientBoostedDecisionTree(params=best_params)
decision_tree_model.fit(X=df_train_x, y=df_train_y)
model_predictions = decision_tree_model.predict(X=df_test_x, df_id=df_test_id)
model_predictions

Unnamed: 0,id,GradientBoostedDecisionTree_predictions
0,0xbd01,41.568961
1,0x5980,32.211820
2,0x6ff3,21.315308
3,0x4a6f,31.278780
4,0x4f5d,24.017522
...,...,...
9114,0x1396,27.079931
9115,0x4ca,36.483931
9116,0x4fe,25.272423
9117,0x576,23.292011


# Evaluate

In [None]:
df_feature_importance = pd.DataFrame({
    'features': decision_tree_model.model.feature_name(),
    'importance': decision_tree_model.model.feature_importance()
})
df_feature_importance.sort_values('importance', ascending=False).plot(kind='bar', x='features', y='importance')

In [None]:
df_actual = df_test_id.merge(df_test_y, how='left', left_index=True, right_index=True)
df_actual

In [None]:
df_evaluate = pd.merge(df_actual, baseline_predictions, how='left', on='id').merge(model_predictions, how='left', on='id')
df_evaluate

In [None]:
df_evaluate['abs_model_error'] = np.abs(df_evaluate['time_taken_min'] - df_evaluate['GradientBoostedDecisionTree_predictions'])
df_evaluate['abs_baseline_error'] = np.abs(df_evaluate['time_taken_min'] - df_evaluate['baseline_predictions'])

df_evaluate['model_error'] = (df_evaluate['time_taken_min'] - df_evaluate['GradientBoostedDecisionTree_predictions'])
df_evaluate['baseline_error'] = (df_evaluate['time_taken_min'] - df_evaluate['baseline_predictions'])

df_evaluate

In [None]:
baseline_rmse = np.sqrt(np.mean(df_evaluate['abs_baseline_error'] ** 2))
model_rmse = np.sqrt(np.mean(df_evaluate['abs_model_error'] ** 2))

print(f'Model RMSE: {model_rmse}')
print(f'Baseline RMSE: {baseline_rmse}')
