In [1]:
import pandas as pd
import numpy as np

In [2]:
# df_train = pd.read_csv("../dataset/train_cleaned_outliers_imputed.csv")
# df_train_id = df_train.pop("Id")
# df_train_target = df_train.pop("CO2 Emissions(g/km)")

# df_test = pd.read_csv("../dataset/test_cleaned_outliers_imputed.csv")
# df_test_id = df_test.pop("Id")

df_train = pd.read_csv("../dataset/train_cleaned_outliers_imputed_3.csv")
df_train_id = df_train.pop("Id")
df_train_target = df_train.pop("CO2 Emissions(g/km)")

df_test = pd.read_csv("../dataset/test_cleaned_outliers_imputed_3.csv")
df_test_id = df_test.pop("Id")

In [3]:
df_train.info()

onehot_columns = ["Make", "Fuel Type", "Transmission_Type", "Vehicle Class General", "Gears", "is_outlier"]
binary_columns = ["Vehicle Class", "Transmission"]
ordinal_columns = ["Vehicle Type"]

numerical_columns = df_train.select_dtypes(include='float64').columns.tolist()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54937 entries, 0 to 54936
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Make                   54937 non-null  object 
 1   Vehicle Class          54937 non-null  object 
 2   Engine Size(L)         54937 non-null  float64
 3   Cylinders              54937 non-null  float64
 4   Transmission           54937 non-null  object 
 5   Fuel Type              54937 non-null  object 
 6   Fuel Consumption City  54937 non-null  float64
 7   Fuel Consumption Hwy   54937 non-null  float64
 8   Fuel Consumption Comb  54937 non-null  float64
 9   Transmission_Type      54937 non-null  object 
 10  Gears                  54937 non-null  int64  
 11  Vehicle Class General  54937 non-null  object 
 12  Vehicle Type           54937 non-null  object 
 13  is_outlier             54937 non-null  int64  
dtypes: float64(5), int64(2), object(7)
memory usage: 5.9+ 

In [4]:
import functions as func

X_train = df_train.copy()
y_train = df_train_target.copy()

X_train, _ = func.encode_categorical_features(X_train, df_train.copy(), onehot_columns, binary_columns, ordinal_columns)
X_train, _ = func.standard_scale_dataframe(X_train, df_train.copy(), numerical_columns)

In [5]:
from bayes_opt import BayesianOptimization
import xgboost as xgb

# Load data
data = X_train.copy()
target = y_train.copy()


# Define the function to optimize
def xgb_evaluate(
    max_depth,
    gamma,
    colsample_bytree,
    subsample,
    eta,
    min_child_weight,
    max_delta_step,
    scale_pos_weight,
    reg_alpha,
    reg_lambda,
):
    params = {
        # "tree_method": "gpu_hist",
        
        "eval_metric": "rmse",
        "max_depth": int(max_depth),
        "subsample": subsample,
        "eta": eta,
        "gamma": gamma,
        "min_child_weight": min_child_weight,
        "colsample_bytree": colsample_bytree,
        "max_delta_step": max_delta_step,
        "scale_pos_weight": scale_pos_weight,
        "reg_alpha": reg_alpha,
        "reg_lambda": reg_lambda,
    }
    cv_result = xgb.cv(params, dtrain, num_boost_round=500, nfold=3, stratified=False)

    return -1.0 * cv_result["test-rmse-mean"].iloc[-1]


# Convert the data into a DMatrix
dtrain = xgb.DMatrix(data, label=target)

xgb_bo = BayesianOptimization(
    xgb_evaluate,
    {
        "max_depth": (1, 10),
        "gamma": (0, 1),
        "colsample_bytree": (0.3, 0.9),
        "subsample": (0.5, 1),
        "eta": (0.01, 0.3),
        "min_child_weight": (0.01, 0.3),
        "max_delta_step": (0, 10),
        "scale_pos_weight": (1, 100),
        "reg_alpha": (0, 1),
        "reg_lambda": (0, 1),
    },
)
xgb_bo.maximize(init_points=10, n_iter=100)

|   iter    |  target   | colsam... |    eta    |   gamma   | max_de... | max_depth | min_ch... | reg_alpha | reg_la... | scale_... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-22.2    [0m | [0m0.4945   [0m | [0m0.1572   [0m | [0m0.4687   [0m | [0m9.08     [0m | [0m2.167    [0m | [0m0.2356   [0m | [0m0.7288   [0m | [0m0.4199   [0m | [0m26.11    [0m | [0m0.9867   [0m |
| [95m2        [0m | [95m-20.12   [0m | [95m0.8807   [0m | [95m0.2294   [0m | [95m0.4026   [0m | [95m2.029    [0m | [95m5.699    [0m | [95m0.2044   [0m | [95m0.9697   [0m | [95m0.692    [0m | [95m45.13    [0m | [95m0.9501   [0m |
| [95m3        [0m | [95m-19.71   [0m | [95m0.4095   [0m | [95m0.2289   [0m | [95m0.5065   [0m | [95m5.76     [0m | [95m5.71     [0m | [95m0.02362  [0m | [95m0.4632   [0m | [95m0.5864   [0m | 

In [6]:
xgb_bo.max

{'target': -19.245749039441947,
 'params': {'colsample_bytree': 0.3,
  'eta': 0.3,
  'gamma': 1.0,
  'max_delta_step': 4.910216813821979,
  'max_depth': 6.922199350830139,
  'min_child_weight': 0.3,
  'reg_alpha': 0.0,
  'reg_lambda': 1.0,
  'scale_pos_weight': 63.04094161253809,
  'subsample': 1.0}}

In [7]:
from bayes_opt import BayesianOptimization
import lightgbm as lgb
from sklearn.datasets import load_iris
from sklearn.metrics import mean_squared_error

# Load data
data = X_train.copy()
target = y_train.copy()


# Define the function to optimize
def lgbm_evaluate(
    max_depth,
    num_leaves,
    min_data_in_leaf,
    feature_fraction,
    bagging_fraction,
    learning_rate,
    lambda_l1,
    lambda_l2,
    min_gain_to_split,
):
    params = {
        # "device": "gpu",  # Add this line
        
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "max_depth": int(max_depth),
        "num_leaves": int(num_leaves),
        "min_data_in_leaf": int(min_data_in_leaf),
        "feature_fraction": feature_fraction,
        "bagging_fraction": bagging_fraction,
        "bagging_freq": 1,
        "learning_rate": learning_rate,
        "lambda_l1": int(lambda_l1),
        "lambda_l2": int(lambda_l2),
        "min_gain_to_split": int(min_gain_to_split),
    }
    train_data = lgb.Dataset(data, label=target)
    cv_result = lgb.cv(
        params, train_data, num_boost_round=1000, nfold=3, stratified=False
    )

    return -1.0 * cv_result["valid rmse-mean"][-1]


# Define the Bayesian Optimization method
lgbm_bo = BayesianOptimization(
    lgbm_evaluate,
    {
        "max_depth": (3, 30),
        "min_data_in_leaf": (10, 100),
        "feature_fraction": (0.1, 0.9),
        "bagging_fraction": (0.1, 0.9),
        "learning_rate": (0.01, 0.3),
        "num_leaves": (20, 3000),
        "min_data_in_leaf": (200, 10000),
        "lambda_l1": (0, 100),
        "lambda_l2": (0, 100),
        "min_gain_to_split": (0, 15),
    },
)
lgbm_bo.maximize(init_points=10, n_iter=100)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | learni... | max_depth | min_da... | min_ga... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m-67.57   [0m | [0m0.3179   [0m | [0m0.6108   [0m | [0m2.439    [0m | [0m97.71    [0m | [0m0.1302   [0m | [0m28.65    [0m | [0m8.261e+03[0m | [0m3.849    [0m | [0m2.559e+03[0m |
| [95m2        [0m | [95m-33.55   [0m | [95m0.7273   [0m | [95m0.6989   [0m | [95m11.51    [0m | [95m5.477    [0m | [95m0.05728  [0m | [95m13.24    [0m | [95m8.689e+03[0m | [95m0.2212   [0m | [95m2.494e+03[0m |
| [95m3        [0m | [95m-29.59   [0m | [95m0.7595   [0m | [95m0.7097   [0m | [95m13.88    [0m | [95m85.28    [0m | [95m0.03868  [0m | [95m10.53    [0m | [95m6.805e+03[0m | [95m12.87    [0m | [95m1.164e+03[0m |
| [95m4        [0m | [95m-25.12   [0m | 

In [8]:
lgbm_bo.max

{'target': -21.08389849478679,
 'params': {'bagging_fraction': 0.9,
  'feature_fraction': 0.9,
  'lambda_l1': 0.0,
  'lambda_l2': 95.35910249844048,
  'learning_rate': 0.3,
  'max_depth': 30.0,
  'min_data_in_leaf': 1903.4341444510599,
  'min_gain_to_split': 15.0,
  'num_leaves': 1820.4290154259495}}