In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from xgboost import plot_importance

In [7]:
df = pd.read_csv('../../data/preprocessed/main-data.csv') 

# convert day to numerical

le = LabelEncoder()
df['day'] = le.fit_transform(df['day'])
df.drop(['category_names','packet_date'], axis=1, inplace=True)

df['is_vip'] = df['is_vip'].astype(int)

# Define features and target variable
X = df[df.columns.difference(['collection_duration'])]
y = df['collection_duration']

In [8]:
def XGBOOST_Evaluate(X_train, X_test, y_train, y_test):
    # Define hyperparameter grid for XGBoost
    parameters = {
        'objective': ['reg:squarederror'],
        'booster': ['gbtree'],
        'learning_rate': [0.1, 0.2, 0.3],
        'max_depth': [7, 10, 15, 20, 25, 30],
        'min_child_weight': [10, 15, 20, 25],
        'colsample_bytree': [0.8, 0.9, 1],
        'n_estimators': [100, 200, 300, 400, 500, 600],
        "reg_alpha": [0.5, 0.2, 1],
        "reg_lambda": [2, 3, 5],
        "gamma": [1, 2, 3]
    }

    # Initialize XGBoost model
    xgb_model = XGBRegressor(random_state=42)

    # Initialize RandomizedSearchCV
    grid_xgb = RandomizedSearchCV(xgb_model, parameters, cv=10, n_iter=10, scoring='neg_mean_absolute_error', verbose=0, n_jobs= -1)

    # Fit RandomizedSearchCV
    grid_xgb.fit(X_train, y_train)

    # Extract best estimator
    best_xgb_model = grid_xgb.best_estimator_

    # Train final XGBoost model with best hyperparameters
    best_xgb_model.fit(X_train, y_train, eval_metric=["rmse"], eval_set=[(X_train, y_train), (X_test, y_test)])

    # Make predictions on the test set
    y_pred = best_xgb_model.predict(X_test)

    # Evaluate predictions
    rmse = root_mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rsquared = r2_score(y_test, y_pred)
    mse = np.mean((y_test - y_pred) ** 2)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

    print("R-squared: %.4f" % rsquared)
    print("RMSE: %.4f" % rmse)
    print("MAE : %.4f" % mae)
    print("MSE : %.4f" % mse)
    print("MAPE: %.4f" % mape)

    p_accuracy = [(1 - np.abs(pred - actual) / actual) * 100 for pred, actual in zip(y_pred, y_test)]

    # Print P_accuracy
    print("P_accuracy: Mean: %.2f%%, Min: %.2f%%, Max: %.2f%%" % (np.mean(p_accuracy), np.min(p_accuracy), np.max(p_accuracy)))
    
    # print best hyperparameters
    results = best_xgb_model.evals_result()
    
    print("Best hyperparameters:", grid_xgb.best_params_)
    print('--------------------------------------------------')


    return results, y_test, y_pred, rmse, mae, rsquared, p_accuracy

In [9]:
# Defining the features to be used for training
feature_combinations = [
    df.columns.difference(['collection_duration', 'day', 'year', 'month', 'hour']),
    df.columns.difference(['collection_duration', 'day', 'year', 'month', 'hour', 'unique_category_count']),
    df.columns.difference(['collection_duration', 'day', 'year', 'month', 'hour', 'unique_category_count', 'average_collection_time']),
    df.columns.difference(['collection_duration', 'day', 'year', 'month', 'hour', 'average_collection_time']),
    df.columns.difference(['collection_duration', 'unique_category_count', 'average_collection_time']),
    df.columns.difference(['collection_duration', 'unique_category_count']),
    df.columns.difference(['collection_duration', 'average_collection_time']),
    df.columns.difference(['collection_duration']),
]

In [10]:
for features in feature_combinations:
    X = df[features]
    y = df['collection_duration']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
    print(f"Training and evaluating model for features: {features}")
    results, y_test, y_pred, rmse, mae, rsquared, p_accuracy = XGBOOST_Evaluate(X_train, X_test, y_train, y_test)
    print("R-squared: %.4f" % rsquared)
    print("RMSE: %.4f" % rmse)
    print("MAE : %.4f" % mae)
    print("P_accuracy: Mean: %.2f%%, Min: %.2f%%, Max: %.2f%%" % (np.mean(p_accuracy), np.min(p_accuracy), np.max(p_accuracy)))


Training and evaluating model for features: Index(['Baby', 'Bakery', 'Cleaning&Household_Supplies', 'Clothing&Accessory',
       'ConvenienceFood&Appetizers', 'Drinks', 'Frozen_food',
       'Fruits&Vegetables', 'Health', 'Healthy_Living', 'Hobby&Game',
       'Home_Care', 'Icecream', 'Meat&Fish', 'Milk&Breakfast', 'Others',
       'Personal_Care&Cosmetics', 'Pet', 'Snacks', 'Special_Days',
       'Staple_Food', 'Technology', 'average_collection_time', 'is_holiday',
       'is_vip', 'item_count', 'unique_category_count'],
      dtype='object')
[0]	validation_0-rmse:2.51908	validation_1-rmse:2.52167
[1]	validation_0-rmse:2.43917	validation_1-rmse:2.44185




[2]	validation_0-rmse:2.37245	validation_1-rmse:2.37525
[3]	validation_0-rmse:2.31692	validation_1-rmse:2.31989
[4]	validation_0-rmse:2.27092	validation_1-rmse:2.27405
[5]	validation_0-rmse:2.23293	validation_1-rmse:2.23625
[6]	validation_0-rmse:2.20164	validation_1-rmse:2.20518
[7]	validation_0-rmse:2.17591	validation_1-rmse:2.17964
[8]	validation_0-rmse:2.15481	validation_1-rmse:2.15874
[9]	validation_0-rmse:2.13750	validation_1-rmse:2.14163
[10]	validation_0-rmse:2.12338	validation_1-rmse:2.12767
[11]	validation_0-rmse:2.11452	validation_1-rmse:2.11915
[12]	validation_0-rmse:2.10601	validation_1-rmse:2.11086
[13]	validation_0-rmse:2.09996	validation_1-rmse:2.10512
[14]	validation_0-rmse:2.09258	validation_1-rmse:2.09793
[15]	validation_0-rmse:2.08659	validation_1-rmse:2.09211
[16]	validation_0-rmse:2.08164	validation_1-rmse:2.08736
[17]	validation_0-rmse:2.07762	validation_1-rmse:2.08353
[18]	validation_0-rmse:2.07432	validation_1-rmse:2.08037
[19]	validation_0-rmse:2.07161	validati

KeyboardInterrupt: 