In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Load the preprocessed dataset.
df = pd.read_csv('../../data/preprocessed/main-data.csv')

# Encode 'day' as integer labels and discard the columns that are not used
# for modelling. Non-mutating calls keep this cell safe to reason about.
le = LabelEncoder()
df = (
    df
    .assign(day=le.fit_transform(df['day']))
    .drop(columns=['category_names', 'packet_date'])
)

# Cast the 'is_vip' column to integers.
df = df.assign(is_vip=df['is_vip'].astype(int))

# Target is 'collection_duration'; every remaining column is a feature.
y = df['collection_duration']
X = df[df.columns.difference(['collection_duration'])]

In [3]:
def RandomForest_evaluate(X_train, X_test, y_train, y_test, random_state=42):
    """Tune a RandomForestRegressor with a randomized search and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and target, passed to ``RandomizedSearchCV.fit``.
    X_test, y_test
        Held-out features and target used to compute the returned metrics.
    random_state : int, default 42
        Seed given to both the forest and the randomized search so the sampled
        hyperparameter candidates (and therefore the reported results) are
        reproducible from run to run.

    Returns
    -------
    tuple
        ``(rmse, mae, rsquared, p_accuracy, best_model, best_params)`` where
        ``p_accuracy`` is a list with one value per test sample, computed as
        ``(1 - |prediction - actual| / actual) * 100``. Values are negative
        when the absolute error exceeds the actual value, and non-finite when
        an actual value is zero.
    """
    # Candidate hyperparameter values sampled by the randomized search.
    param_distributions = {
        'n_estimators': [100, 200, 300],
        'max_depth': [5, 7, 10, 15, 20, 25, 30],
        'min_samples_split': [2, 3, 5],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2'],
    }

    model = RandomForestRegressor(random_state=random_state)

    # Seed the search as well: without random_state the 10 sampled candidates
    # change on every run, so the "best" parameters are not reproducible.
    search = RandomizedSearchCV(
        model,
        param_distributions,
        n_iter=10,
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        cv=5,
        verbose=0,
        random_state=random_state,
    )
    search.fit(X_train, y_train)

    # Best estimator found by the search, used for test-set predictions.
    best_model = search.best_estimator_
    predictions = best_model.predict(X_test)

    # Aggregate error metrics on the held-out data.
    rmse = root_mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    rsquared = r2_score(y_test, predictions)

    # Per-sample percentage accuracy, vectorized; positional alignment matches
    # the original zip(predictions, y_test) pairing.
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(predictions, dtype=float)
    p_accuracy = list((1 - np.abs(predicted - actual) / actual) * 100)

    return rmse, mae, rsquared, p_accuracy, best_model, search.best_params_

In [4]:
# Candidate feature sets for training. Each entry lists the columns left out
# in addition to the target 'collection_duration'; the remaining columns of
# df form one feature combination.
feature_combinations = [
    df.columns.difference(['collection_duration', *excluded])
    for excluded in (
        ['day', 'year', 'month', 'hour'],
        ['day', 'year', 'month', 'hour', 'unique_category_count'],
        ['day', 'year', 'month', 'hour', 'unique_category_count', 'average_collection_time'],
        ['day', 'year', 'month', 'hour', 'average_collection_time'],
        ['unique_category_count', 'average_collection_time'],
        ['unique_category_count'],
        ['average_collection_time'],
        [],
    )
]

In [5]:
# Fit and score one tuned Random Forest per feature combination.
# The target column is the same for every combination.
y = df['collection_duration']

for features in feature_combinations:
    # Restrict the frame to the current feature set.
    X = df[features]

    # Hold out 20% of the rows for testing, with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    # Tune, predict and compute the metrics for this feature set.
    rmse, mae, rsquared, p_accuracy, best_model, best_params = RandomForest_evaluate(
        X_train, X_test, y_train, y_test
    )

    # Report the results; one metric per output line.
    print("Feature Combination:", features)
    print(
        "R-squared: %.4f" % rsquared,
        "RMSE: %.4f" % rmse,
        "MAE : %.4f" % mae,
        "P_accuracy: Mean: %.2f%%, Min: %.2f%%, Max: %.2f%%" % (np.mean(p_accuracy), np.min(p_accuracy), np.max(p_accuracy)),
        sep="\n",
    )
    print("Best parameters:", best_params)
    print("\n")



Feature Combination: Index(['Baby', 'Bakery', 'Cleaning&Household_Supplies', 'Clothing&Accessory',
       'ConvenienceFood&Appetizers', 'Drinks', 'Frozen_food',
       'Fruits&Vegetables', 'Health', 'Healthy_Living', 'Hobby&Game',
       'Home_Care', 'Icecream', 'Meat&Fish', 'Milk&Breakfast', 'Others',
       'Personal_Care&Cosmetics', 'Pet', 'Snacks', 'Special_Days',
       'Staple_Food', 'Technology', 'average_collection_time', 'is_holiday',
       'is_vip', 'item_count', 'unique_category_count'],
      dtype='object')
R-squared: 0.3405
RMSE: 2.0200
MAE : 1.5948
P_accuracy: Mean: 68.95%, Min: -58.35%, Max: 100.00%
Best parameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 20}






Feature Combination: Index(['Baby', 'Bakery', 'Cleaning&Household_Supplies', 'Clothing&Accessory',
       'ConvenienceFood&Appetizers', 'Drinks', 'Frozen_food',
       'Fruits&Vegetables', 'Health', 'Healthy_Living', 'Hobby&Game',
       'Home_Care', 'Icecream', 'Meat&Fish', 'Milk&Breakfast', 'Others',
       'Personal_Care&Cosmetics', 'Pet', 'Snacks', 'Special_Days',
       'Staple_Food', 'Technology', 'average_collection_time', 'is_holiday',
       'is_vip', 'item_count'],
      dtype='object')
R-squared: 0.3392
RMSE: 2.0220
MAE : 1.5959
P_accuracy: Mean: 68.93%, Min: -67.44%, Max: 100.00%
Best parameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 25}






Feature Combination: Index(['Baby', 'Bakery', 'Cleaning&Household_Supplies', 'Clothing&Accessory',
       'ConvenienceFood&Appetizers', 'Drinks', 'Frozen_food',
       'Fruits&Vegetables', 'Health', 'Healthy_Living', 'Hobby&Game',
       'Home_Care', 'Icecream', 'Meat&Fish', 'Milk&Breakfast', 'Others',
       'Personal_Care&Cosmetics', 'Pet', 'Snacks', 'Special_Days',
       'Staple_Food', 'Technology', 'is_holiday', 'is_vip', 'item_count'],
      dtype='object')
R-squared: 0.3404
RMSE: 2.0201
MAE : 1.5952
P_accuracy: Mean: 68.86%, Min: -66.71%, Max: 100.00%
Best parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 15}






Feature Combination: Index(['Baby', 'Bakery', 'Cleaning&Household_Supplies', 'Clothing&Accessory',
       'ConvenienceFood&Appetizers', 'Drinks', 'Frozen_food',
       'Fruits&Vegetables', 'Health', 'Healthy_Living', 'Hobby&Game',
       'Home_Care', 'Icecream', 'Meat&Fish', 'Milk&Breakfast', 'Others',
       'Personal_Care&Cosmetics', 'Pet', 'Snacks', 'Special_Days',
       'Staple_Food', 'Technology', 'is_holiday', 'is_vip', 'item_count',
       'unique_category_count'],
      dtype='object')
R-squared: 0.3397
RMSE: 2.0212
MAE : 1.5952
P_accuracy: Mean: 68.94%, Min: -63.80%, Max: 100.00%
Best parameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 20}






Feature Combination: Index(['Baby', 'Bakery', 'Cleaning&Household_Supplies', 'Clothing&Accessory',
       'ConvenienceFood&Appetizers', 'Drinks', 'Frozen_food',
       'Fruits&Vegetables', 'Health', 'Healthy_Living', 'Hobby&Game',
       'Home_Care', 'Icecream', 'Meat&Fish', 'Milk&Breakfast', 'Others',
       'Personal_Care&Cosmetics', 'Pet', 'Snacks', 'Special_Days',
       'Staple_Food', 'Technology', 'day', 'hour', 'is_holiday', 'is_vip',
       'item_count', 'month', 'year'],
      dtype='object')
R-squared: 0.3448
RMSE: 2.0134
MAE : 1.5887
P_accuracy: Mean: 69.05%, Min: -85.46%, Max: 100.00%
Best parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20}






Feature Combination: Index(['Baby', 'Bakery', 'Cleaning&Household_Supplies', 'Clothing&Accessory',
       'ConvenienceFood&Appetizers', 'Drinks', 'Frozen_food',
       'Fruits&Vegetables', 'Health', 'Healthy_Living', 'Hobby&Game',
       'Home_Care', 'Icecream', 'Meat&Fish', 'Milk&Breakfast', 'Others',
       'Personal_Care&Cosmetics', 'Pet', 'Snacks', 'Special_Days',
       'Staple_Food', 'Technology', 'average_collection_time', 'day', 'hour',
       'is_holiday', 'is_vip', 'item_count', 'month', 'year'],
      dtype='object')
R-squared: 0.3451
RMSE: 2.0129
MAE : 1.5891
P_accuracy: Mean: 69.02%, Min: -75.09%, Max: 100.00%
Best parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 20}






Feature Combination: Index(['Baby', 'Bakery', 'Cleaning&Household_Supplies', 'Clothing&Accessory',
       'ConvenienceFood&Appetizers', 'Drinks', 'Frozen_food',
       'Fruits&Vegetables', 'Health', 'Healthy_Living', 'Hobby&Game',
       'Home_Care', 'Icecream', 'Meat&Fish', 'Milk&Breakfast', 'Others',
       'Personal_Care&Cosmetics', 'Pet', 'Snacks', 'Special_Days',
       'Staple_Food', 'Technology', 'day', 'hour', 'is_holiday', 'is_vip',
       'item_count', 'month', 'unique_category_count', 'year'],
      dtype='object')
R-squared: 0.3456
RMSE: 2.0123
MAE : 1.5882
P_accuracy: Mean: 69.07%, Min: -81.13%, Max: 100.00%
Best parameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 20}






Feature Combination: Index(['Baby', 'Bakery', 'Cleaning&Household_Supplies', 'Clothing&Accessory',
       'ConvenienceFood&Appetizers', 'Drinks', 'Frozen_food',
       'Fruits&Vegetables', 'Health', 'Healthy_Living', 'Hobby&Game',
       'Home_Care', 'Icecream', 'Meat&Fish', 'Milk&Breakfast', 'Others',
       'Personal_Care&Cosmetics', 'Pet', 'Snacks', 'Special_Days',
       'Staple_Food', 'Technology', 'average_collection_time', 'day', 'hour',
       'is_holiday', 'is_vip', 'item_count', 'month', 'unique_category_count',
       'year'],
      dtype='object')
R-squared: 0.3449
RMSE: 2.0133
MAE : 1.5892
P_accuracy: Mean: 69.05%, Min: -67.36%, Max: 100.00%
Best parameters: {'n_estimators': 100, 'min_samples_split': 3, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 20}


