In [1]:
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv("../dataset/train_cleaned_outliers_imputed.csv")
df_train_id = df_train.pop("Id")
df_train_target = df_train.pop("CO2 Emissions(g/km)")

df_test = pd.read_csv("../dataset/test_cleaned_outliers_imputed.csv")
df_test_id = df_test.pop("Id")

In [3]:
df_train.info()

onehot_columns = ["Make", "Fuel Type", "Transmission_Type", "Vehicle Class General", "Gears", "is_outlier"]
binary_columns = ["Vehicle Class", "Transmission"]
ordinal_columns = ["Vehicle Type"]

numerical_columns = df_train.select_dtypes(include='float64').columns.tolist()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54937 entries, 0 to 54936
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Make                   54937 non-null  object 
 1   Vehicle Class          54937 non-null  object 
 2   Engine Size(L)         54937 non-null  float64
 3   Cylinders              54937 non-null  float64
 4   Transmission           54937 non-null  object 
 5   Fuel Type              54937 non-null  object 
 6   Fuel Consumption City  54937 non-null  float64
 7   Fuel Consumption Hwy   54937 non-null  float64
 8   Fuel Consumption Comb  54937 non-null  float64
 9   Transmission_Type      54937 non-null  object 
 10  Gears                  54937 non-null  int64  
 11  Vehicle Class General  54937 non-null  object 
 12  Vehicle Type           54937 non-null  object 
 13  is_outlier             54937 non-null  int64  
dtypes: float64(5), int64(2), object(7)
memory usage: 5.9+ 

In [4]:
import functions as func

X_train = df_train.copy()
y_train = df_train_target.copy()

X_train, _ = func.encode_categorical_features(X_train, df_train.copy(), onehot_columns, binary_columns, ordinal_columns)
X_train, _ = func.standard_scale_dataframe(X_train, df_train.copy(), numerical_columns)

In [18]:
from bayes_opt import BayesianOptimization
import xgboost as xgb

# Load data
data = X_train.copy()
target = y_train.copy()


# Define the function to optimize
def xgb_evaluate(
    max_depth,
    gamma,
    colsample_bytree,
    subsample,
    eta,
    min_child_weight,
    max_delta_step,
    scale_pos_weight,
    reg_alpha,
    reg_lambda,
):
    params = {
        "eval_metric": "rmse",
        "max_depth": int(max_depth),
        "subsample": subsample,
        "eta": eta,
        "gamma": gamma,
        "min_child_weight": min_child_weight,
        "colsample_bytree": colsample_bytree,
        "max_delta_step": max_delta_step,
        "scale_pos_weight": scale_pos_weight,
        "reg_alpha": reg_alpha,
        "reg_lambda": reg_lambda,
    }
    cv_result = xgb.cv(params, dtrain, num_boost_round=1000, nfold=3, stratified=False)

    return -1.0 * cv_result["test-rmse-mean"].iloc[-1]


# Convert the data into a DMatrix
dtrain = xgb.DMatrix(data, label=target)

xgb_bo = BayesianOptimization(
    xgb_evaluate,
    {
        "max_depth": (1, 10),
        "gamma": (0, 1),
        "colsample_bytree": (0.3, 0.9),
        "subsample": (0.5, 1),
        "eta": (0.01, 0.3),
        "min_child_weight": (0.01, 0.3),
        "max_delta_step": (0, 10),
        "scale_pos_weight": (1, 100),
        "reg_alpha": (0, 1),
        "reg_lambda": (0, 1),
    },
)
xgb_bo.maximize(init_points=10, n_iter=100)

|   iter    |  target   | colsam... |    eta    |   gamma   | max_de... | max_depth | min_ch... | reg_alpha | reg_la... | scale_... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-20.17   [0m | [0m0.4004   [0m | [0m0.2146   [0m | [0m0.08276  [0m | [0m1.543    [0m | [0m5.739    [0m | [0m0.2465   [0m | [0m0.5844   [0m | [0m0.9947   [0m | [0m6.425    [0m | [0m0.617    [0m |
| [0m2        [0m | [0m-20.73   [0m | [0m0.5798   [0m | [0m0.271    [0m | [0m0.1119   [0m | [0m9.286    [0m | [0m3.218    [0m | [0m0.02169  [0m | [0m0.5426   [0m | [0m0.2635   [0m | [0m22.64    [0m | [0m0.5007   [0m |
| [95m3        [0m | [95m-19.92   [0m | [95m0.3148   [0m | [95m0.07119  [0m | [95m0.3792   [0m | [95m9.698    [0m | [95m5.455    [0m | [95m0.254    [0m | [95m0.1083   [0m | [95m0.1787   [0m | [95m32.21  

In [19]:
xgb_bo.max

{'target': -19.313372947948267,
 'params': {'colsample_bytree': 0.3,
  'eta': 0.03508222219931691,
  'gamma': 0.7686173125574116,
  'max_delta_step': 0.0,
  'max_depth': 8.350757895869842,
  'min_child_weight': 0.3,
  'reg_alpha': 0.19008440126889958,
  'reg_lambda': 0.7610085486525633,
  'scale_pos_weight': 13.442385709119758,
  'subsample': 1.0}}

In [15]:
from bayes_opt import BayesianOptimization
import lightgbm as lgb
from sklearn.datasets import load_iris
from sklearn.metrics import mean_squared_error

# Load data
data = X_train.copy()
target = y_train.copy()


# Define the function to optimize
def lgbm_evaluate(
    max_depth,
    num_leaves,
    min_data_in_leaf,
    feature_fraction,
    bagging_fraction,
    learning_rate,
    lambda_l1,
    lambda_l2,
    min_gain_to_split,
):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "max_depth": int(max_depth),
        "num_leaves": int(num_leaves),
        "min_data_in_leaf": int(min_data_in_leaf),
        "feature_fraction": feature_fraction,
        "bagging_fraction": bagging_fraction,
        "bagging_freq": 1,
        "learning_rate": learning_rate,
        "lambda_l1": int(lambda_l1),
        "lambda_l2": int(lambda_l2),
        "min_gain_to_split": int(min_gain_to_split),
    }
    train_data = lgb.Dataset(data, label=target)
    cv_result = lgb.cv(
        params, train_data, num_boost_round=1000, nfold=3, stratified=False
    )

    return -1.0 * cv_result["valid rmse-mean"][-1]


# Define the Bayesian Optimization method
lgbm_bo = BayesianOptimization(
    lgbm_evaluate,
    {
        "max_depth": (3, 30),
        "min_data_in_leaf": (10, 100),
        "feature_fraction": (0.1, 0.9),
        "bagging_fraction": (0.1, 0.9),
        "learning_rate": (0.01, 0.3),
        "num_leaves": (20, 3000),
        "min_data_in_leaf": (200, 10000),
        "lambda_l1": (0, 100),
        "lambda_l2": (0, 100),
        "min_gain_to_split": (0, 15),
    },
)
lgbm_bo.maximize(init_points=10, n_iter=100)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | learni... | max_depth | min_da... | min_ga... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-24.61   [0m | [0m0.5389   [0m | [0m0.6888   [0m | [0m57.81    [0m | [0m70.71    [0m | [0m0.1841   [0m | [0m22.81    [0m | [0m2.733e+03[0m | [0m9.712    [0m | [0m1.213e+03[0m |
| [0m2        [0m | [0m-36.6    [0m | [0m0.4924   [0m | [0m0.6678   [0m | [0m83.66    [0m | [0m60.58    [0m | [0m0.1444   [0m | [0m6.003    [0m | [0m7.256e+03[0m | [0m5.734    [0m | [0m2.48e+03 [0m |
| [95m3        [0m | [95m-23.34   [0m | [95m0.4365   [0m | [95m0.3532   [0m | [95m15.47    [0m | [95m62.8     [0m | [95m0.2128   [0m | [95m15.13    [0m | [95m1.466e+03[0m | [95m9.092    [0m | [95m1.581e+03[0m |
| [0m4        [0m | [0m-30.09   [0m | [0m0.4757   

In [16]:
lgbm_bo.max

{'target': -20.590819755716307,
 'params': {'bagging_fraction': 0.8690077395539522,
  'feature_fraction': 0.7100616390120457,
  'lambda_l1': 17.140478403668403,
  'lambda_l2': 43.36675440761465,
  'learning_rate': 0.06900334013323922,
  'max_depth': 5.299581326745951,
  'min_data_in_leaf': 210.02583586502402,
  'min_gain_to_split': 1.5177861953751233,
  'num_leaves': 2150.5332871592927}}