In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import numpy as np

In [12]:
data = pd.read_csv(r"..\Data\NEW_ADS3.csv")

In [13]:
data = data.drop(['Unnamed: 0', 'finalspend_usd', "effectiveness_usd", "starttime"], axis = 1)

In [14]:
data=data[['dag_categ_INDUSTRIAL CAPEX',
       'dag_categ_INDUSTRIAL CAPEX - GLOBAL - ABOVE 500K',
       'dag_categ_INDUSTRIAL CAPEX - LOCAL',
       'dag_categ_Indirect Trade Marketing - Distributors and Associations',
       'dag_categ_Office Supplies', 'dag_categ_Other Office Utilities',
       'dag_categ_Water, Electricity & Gas (Office)', 'categ_l1_COMMERCIAL',
       'categ_l1_PACKAGING', 'categ_l2_IND CAPEX', 'categ_l2_MARKETING',
       'categ_l2_OFFICE UTILITIES', 'categ_l2_PACKAGING', 'categ_l2_POCM',
       'categ_l2_SALES','baselinespend_usd','bestbid_usd','participant','itemno','biddingperiod','timebetweenlotclosing','setareviewperiodafterlotcloses_Yes','zone_APAC','zone_EUR','zone_MAZ','zone_NAZ','zone_SAZ','enabletrafficlightbidding_Yes', "data", "event_id", "efficiency"]]

In [15]:
train = data[data["data"]== 'train']
forecast = data[data['data'] == 'submission_df']

In [16]:
train_final, test_final = train.iloc[:10000], train.iloc[10000:]

In [17]:
train_final

Unnamed: 0,dag_categ_INDUSTRIAL CAPEX,dag_categ_INDUSTRIAL CAPEX - GLOBAL - ABOVE 500K,dag_categ_INDUSTRIAL CAPEX - LOCAL,dag_categ_Indirect Trade Marketing - Distributors and Associations,dag_categ_Office Supplies,dag_categ_Other Office Utilities,"dag_categ_Water, Electricity & Gas (Office)",categ_l1_COMMERCIAL,categ_l1_PACKAGING,categ_l2_IND CAPEX,...,setareviewperiodafterlotcloses_Yes,zone_APAC,zone_EUR,zone_MAZ,zone_NAZ,zone_SAZ,enabletrafficlightbidding_Yes,data,event_id,efficiency
0,0,0,0,0,0,0,0,1,0,0,...,1,1,0,0,0,0,1,train,FULA,0.312998
1,0,0,1,0,0,0,0,0,0,1,...,1,0,0,1,0,0,1,train,GAEQ,0.019504
2,0,0,1,0,0,0,0,0,0,1,...,1,0,0,0,0,1,1,train,AJUL,0.000000
3,0,0,1,0,0,0,0,0,0,1,...,1,0,0,1,0,0,1,train,CJFZ,0.000000
4,0,0,1,0,0,0,0,0,0,1,...,1,0,0,1,0,0,1,train,BMRW,0.039700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,1,train,DCBO,0.482963
9996,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,1,train,EZGS,0.482518
9997,0,0,1,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,train,EKYB,0.479010
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,train,MYR,0.476593


In [8]:
train.columns

Index(['dag_categ_INDUSTRIAL CAPEX',
       'dag_categ_INDUSTRIAL CAPEX - GLOBAL - ABOVE 500K',
       'dag_categ_INDUSTRIAL CAPEX - LOCAL',
       'dag_categ_Indirect Trade Marketing - Distributors and Associations',
       'dag_categ_Office Supplies', 'dag_categ_Other Office Utilities',
       'dag_categ_Water, Electricity & Gas (Office)', 'categ_l1_COMMERCIAL',
       'categ_l1_PACKAGING', 'categ_l2_IND CAPEX', 'categ_l2_MARKETING',
       'categ_l2_OFFICE UTILITIES', 'categ_l2_PACKAGING', 'categ_l2_POCM',
       'categ_l2_SALES', 'baselinespend_usd', 'bestbid_usd', 'participant',
       'itemno', 'biddingperiod', 'timebetweenlotclosing',
       'setareviewperiodafterlotcloses_Yes', 'zone_APAC', 'zone_EUR',
       'zone_MAZ', 'zone_NAZ', 'zone_SAZ', 'enabletrafficlightbidding_Yes',
       'data', 'event_id', 'efficiency'],
      dtype='object')

# Hyperparameter selection using Optuna

In [10]:
import optuna
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

def smape(y_true, y_pred):
    epsilon = 1e-7  # Avoid division by zero
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true) + epsilon))

def catboost_model(target, train, train_full, test, forecast):
    model_name = "CatBoost"
    print(f"Running {model_name}")
    
    x_train = train.drop(target, axis=1)
    y_train = train[target]
    x_train_full = train_full.drop(target, axis=1)
    y_train_full = train_full[target]
    x_test = test.drop(target, axis=1)
    x_forecast = forecast.drop(target, axis=1)

    # Define the objective function for Optuna
    def objective(trial):
        # Hyperparameters to tune
        params = {
            "iterations": trial.suggest_int("iterations", 1000, 1000),
            "depth": trial.suggest_int("depth", 12, 14),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
            "border_count": trial.suggest_int("border_count", 32, 255),
            "random_seed": 123,
            "loss_function": "MAE",
            # "custom_metric":["RMSE", smape],
            "verbose": 0
        }
        model = CatBoostRegressor(**params)
        model.fit(x_train_full, y_train_full)
        preds = model.predict(x_train)
        mae_score = mean_absolute_error(y_train[:len(preds)], preds)
        smape_score = smape(y_train[:len(preds)], preds)
        combined_score = 0.8 * mae_score + 0.2 * smape_score
        return combined_score

    # Optimize hyperparameters using Optuna
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=50)

    # Best parameters
    best_params = study.best_params
    print(f"Best parameters: {best_params}")

    # Train and predict with best parameters
    model = CatBoostRegressor(**best_params)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Train on the full dataset and forecast
    model.fit(x_train_full, y_train_full)
    y_forecast = model.predict(x_forecast)

    print(f"Completed {model_name} model")
    return y_pred, y_forecast


In [11]:
output = {}

In [12]:
y_pred,y_forecast = catboost_model("efficiency", train_final.drop(['data', 'event_id'],axis = 1), train.drop(['data', 'event_id'],axis = 1),test_final.drop(['data', 'event_id'],axis = 1), forecast.drop(['data', 'event_id'], axis = 1))

[I 2024-11-29 03:15:49,337] A new study created in memory with name: no-name-ba82ced4-c37c-4386-90a9-60dcd982fa0f


Running CatBoost


[I 2024-11-29 03:17:55,347] Trial 0 finished with value: 7.139004621335823 and parameters: {'iterations': 1000, 'depth': 14, 'learning_rate': 0.2821047901420542, 'l2_leaf_reg': 6.544742001368823, 'bagging_temperature': 0.0744884708917053, 'border_count': 112}. Best is trial 0 with value: 7.139004621335823.
[I 2024-11-29 03:19:54,055] Trial 1 finished with value: 8.824104185958925 and parameters: {'iterations': 1000, 'depth': 13, 'learning_rate': 0.13756927969254584, 'l2_leaf_reg': 7.406247807156991, 'bagging_temperature': 0.38892603109221124, 'border_count': 238}. Best is trial 0 with value: 7.139004621335823.
[I 2024-11-29 03:21:54,905] Trial 2 finished with value: 7.726436502944267 and parameters: {'iterations': 1000, 'depth': 14, 'learning_rate': 0.21803275439568665, 'l2_leaf_reg': 3.2171821840460155, 'bagging_temperature': 0.022413996974697814, 'border_count': 84}. Best is trial 0 with value: 7.139004621335823.
[I 2024-11-29 03:22:32,275] Trial 3 finished with value: 9.157534802395

Best parameters: {'iterations': 1000, 'depth': 14, 'learning_rate': 0.2885044437918236, 'l2_leaf_reg': 9.049888644428659, 'bagging_temperature': 0.3466650286110847, 'border_count': 173}
0:	learn: 0.1251210	total: 308ms	remaining: 5m 8s
1:	learn: 0.1231282	total: 543ms	remaining: 4m 30s
2:	learn: 0.1216923	total: 765ms	remaining: 4m 14s
3:	learn: 0.1200372	total: 995ms	remaining: 4m 7s
4:	learn: 0.1184145	total: 1.23s	remaining: 4m 3s
5:	learn: 0.1178229	total: 1.44s	remaining: 3m 59s
6:	learn: 0.1170674	total: 1.7s	remaining: 4m 1s
7:	learn: 0.1158793	total: 1.93s	remaining: 3m 58s
8:	learn: 0.1151236	total: 2.14s	remaining: 3m 55s
9:	learn: 0.1144275	total: 2.37s	remaining: 3m 54s
10:	learn: 0.1140972	total: 2.57s	remaining: 3m 51s
11:	learn: 0.1133549	total: 2.78s	remaining: 3m 49s
12:	learn: 0.1127652	total: 2.97s	remaining: 3m 45s
13:	learn: 0.1123279	total: 3.19s	remaining: 3m 44s
14:	learn: 0.1117903	total: 3.38s	remaining: 3m 41s
15:	learn: 0.1112292	total: 3.59s	remaining: 3m 4

# Best Parameters

In [None]:
Best_parameters = {
    'iterations': 1000,
    'depth': 14,
    'learning_rate': 0.2885044437918236,
    'l2_leaf_reg': 9.049888644428659,
    'bagging_temperature': 0.3466650286110847,
    'border_count': 173}