In [1]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold
import pandas as pd, numpy as np
import pickle
import lightgbm as lgb
from catboost import Pool, CatBoostRegressor
from sklearn.metrics import mean_squared_error as mse
import warnings
warnings.filterwarnings('ignore')
seed = 21
np.random.seed(seed)

In [2]:
with open("../input/ba-training-data-3/training_data.pkl", "rb") as f:
    train_ = pickle.load(f)

In [3]:
X = train_.drop(['UNIQUE_IDENTIFIER', 'Y1', 'Y2'], axis=1)
y = train_[["Y1", "Y2"]]

In [4]:
from sklearn.preprocessing import LabelEncoder
le1 = LabelEncoder()
le2 = LabelEncoder()
X['CATEGORY_1'] = le1.fit_transform(X['CATEGORY_1'])
X['CATEGORY_2'] = le2.fit_transform(X['CATEGORY_2'])

# LightGBM

In [5]:
# Prediction arrays
val_pred1 = np.zeros(X.shape[0])
val_pred2 = np.zeros(X.shape[0])

FOLDS = 5

def lgbm_cv1(learning_rate, num_leaves, feature_fraction, feature_fraction_bynode, bagging_fraction, bagging_freq, min_data_in_leaf, min_sum_hessian_in_leaf, lambda_l1, lambda_l2):
    params = {  'seed': seed,
                'feature_fraction_seed': seed,
                'bagging_seed': seed,
                'drop_seed': seed,
                'data_random_seed': seed,
                'objective': 'rmse',
                'boosting': 'gbdt',
                'verbosity': -1,
                'n_jobs': -1}
    params['learning_rate'] = max(min(learning_rate, 1), 0)
    params["num_leaves"] = int(round(num_leaves))
    params['feature_fraction'] = max(min(feature_fraction, 1), 0)
    params['feature_fraction_bynode'] = max(min(feature_fraction_bynode, 1), 0)
    params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
    params['min_data_in_leaf'] = int(round(min_data_in_leaf))
    params['min_sum_hessian_in_leaf'] = min_sum_hessian_in_leaf
    params['lambda_l1'] = lambda_l1
    params['lambda_l2'] = lambda_l2
    
    # K-fold cross valiation
    kfold = KFold(n_splits=FOLDS)
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
        print("#"*40)
        print("Training Fold - ", fold+1)
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val     = X.iloc[val_idx]  , y.iloc[val_idx]

        # create lgb datasets
        train_data1 = lgb.Dataset(X_train, y_train.iloc[:,0], categorical_feature = ["STATUS_CHECK", "CATEGORY_1", "CATEGORY_2"])
        val_data1   = lgb.Dataset(X_val,   y_val.iloc[:,0],   categorical_feature = ["STATUS_CHECK", "CATEGORY_1", "CATEGORY_2"])

        # Train model
        print("#"*20)
        print("Training for Y1")
        model1 = lgb.train(params = params,
                         train_set = train_data1,
                         valid_sets = [train_data1, val_data1],
                         num_boost_round = 100000,
                         valid_names=["training", "validation"],
                         early_stopping_rounds = 1000,
                         verbose_eval = 500)

        # Predict validation set
        val_pred1[val_idx] = model1.predict(X_val)
    
    return -np.sqrt(mse(train_["Y1"], val_pred1))

In [6]:
pbounds1 = {'learning_rate': (0.1, 0.4),
        'num_leaves': (300, 800),
        'feature_fraction': (0.6, 0.99),
        'feature_fraction_bynode': (0.1, 0.6),
        'bagging_fraction': (0.4, 1),
        'bagging_freq': (60, 100),
        'min_data_in_leaf': (100, 500),
        'min_sum_hessian_in_leaf':(30, 80),
       'lambda_l1': (3, 10),
        'lambda_l2': (0.01, 2)}

In [7]:
lgbm_bo1 = BayesianOptimization(f = lgbm_cv1, pbounds = pbounds1, verbose = 2, random_state = seed)

lgbm_bo1.maximize(init_points = 5, n_iter = 20)

|   iter    |  target   | baggin... | baggin... | featur... | featur... | lambda_l1 | lambda_l2 | learni... | min_da... | min_su... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------------------
########################################
Training Fold -  1
####################
Training for Y1
Training until validation scores don't improve for 1000 rounds
[500]	training's rmse: 5.21752	validation's rmse: 6.11987
[1000]	training's rmse: 4.25562	validation's rmse: 6.31366
Early stopping, best iteration is:
[16]	training's rmse: 6.99874	validation's rmse: 5.63904
########################################
Training Fold -  2
####################
Training for Y1
Training until validation scores don't improve for 1000 rounds
[500]	training's rmse: 5.29466	validation's rmse: 5.62451
[1000]	training's rmse: 4.32358	validation's rmse: 5.82598
Early stopping, best iteration is:
[57]	training's rmse: 6.732

In [8]:
print(lgbm_bo1.max)

{'target': -6.715773763699902, 'params': {'bagging_fraction': 0.7446132365786178, 'bagging_freq': 94.57044996418587, 'feature_fraction': 0.9012097906532421, 'feature_fraction_bynode': 0.2724293678054648, 'lambda_l1': 6.697079462213692, 'lambda_l2': 0.05709330859845584, 'learning_rate': 0.17588487226531224, 'min_data_in_leaf': 154.15270440199714, 'min_sum_hessian_in_leaf': 58.37985742493366, 'num_leaves': 622.1019844986934}}


In [9]:
def lgbm_cv2(learning_rate, num_leaves, feature_fraction, feature_fraction_bynode, bagging_fraction, bagging_freq, min_data_in_leaf, min_sum_hessian_in_leaf, lambda_l1, lambda_l2):
    params = {  'seed': seed,
                'feature_fraction_seed': seed,
                'bagging_seed': seed,
                'drop_seed': seed,
                'data_random_seed': seed,
                'objective': 'rmse',
                'boosting': 'gbdt',
                'verbosity': -1,
                'n_jobs': -1}
    params['learning_rate'] = max(min(learning_rate, 1), 0)
    params["num_leaves"] = int(round(num_leaves))
    params['feature_fraction'] = max(min(feature_fraction, 1), 0)
    params['feature_fraction_bynode'] = max(min(feature_fraction_bynode, 1), 0)
    params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
    params['min_data_in_leaf'] = int(round(min_data_in_leaf))
    params['min_sum_hessian_in_leaf'] = min_sum_hessian_in_leaf
    params['lambda_l1'] = lambda_l1
    params['lambda_l2'] = lambda_l2
    
    # K-fold cross valiation
    kfold = KFold(n_splits=FOLDS)
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
        print("#"*40)
        print("Training Fold - ", fold+1)
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val     = X.iloc[val_idx]  , y.iloc[val_idx]

        # create lgb datasets
        train_data2 = lgb.Dataset(X_train, y_train.iloc[:,1], categorical_feature = ["STATUS_CHECK", "CATEGORY_1", "CATEGORY_2"])
        val_data2   = lgb.Dataset(X_val,   y_val.iloc[:,1],   categorical_feature = ["STATUS_CHECK", "CATEGORY_1", "CATEGORY_2"])
        
        print("#"*20)
        print("Training for Y2")
        model2 = lgb.train(params = params,
                         train_set = train_data2,
                         valid_sets = [train_data2, val_data2],
                         num_boost_round = 100000,
                         valid_names=["training", "validation"],
                         early_stopping_rounds = 1000,
                         verbose_eval = 500)

        # Predict validation set
        val_pred2[val_idx] = model2.predict(X_val)
    
    return -np.sqrt(mse(train_["Y2"], val_pred2))

In [10]:
pbounds2 = {'learning_rate': (0.05, 0.3),
        'num_leaves': (300, 700),
        'feature_fraction': (0.1, 0.5),
        'feature_fraction_bynode': (0.5, 0.99),
        'bagging_fraction': (0.01, 1),
        'bagging_freq': (10, 70),
        'min_data_in_leaf': (400, 1000),
        'min_sum_hessian_in_leaf':(5, 50),
       'lambda_l1': (0.1, 10),
        'lambda_l2': (0.1, 10)}

In [11]:
lgbm_bo2 = BayesianOptimization(f = lgbm_cv2, pbounds = pbounds2, verbose = 2, random_state = seed)

lgbm_bo2.maximize(init_points = 5, n_iter = 20)

|   iter    |  target   | baggin... | baggin... | featur... | featur... | lambda_l1 | lambda_l2 | learni... | min_da... | min_su... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------------------
########################################
Training Fold -  1
####################
Training for Y2
Training until validation scores don't improve for 1000 rounds
[500]	training's rmse: 100.757	validation's rmse: 120.914
[1000]	training's rmse: 88.6385	validation's rmse: 122.26
Early stopping, best iteration is:
[123]	training's rmse: 113.973	validation's rmse: 119.762
########################################
Training Fold -  2
####################
Training for Y2
Training until validation scores don't improve for 1000 rounds
[500]	training's rmse: 101.119	validation's rmse: 120.573
[1000]	training's rmse: 89.0368	validation's rmse: 122.039
Early stopping, best iteration is:
[113]	training's rmse: 114.

In [12]:
print(lgbm_bo2.max)

{'target': -120.62179612097559, 'params': {'bagging_fraction': 0.01, 'bagging_freq': 54.41390829728485, 'feature_fraction': 0.5, 'feature_fraction_bynode': 0.5, 'lambda_l1': 0.1, 'lambda_l2': 0.1, 'learning_rate': 0.05, 'min_data_in_leaf': 739.0056075760633, 'min_sum_hessian_in_leaf': 14.315119859932201, 'num_leaves': 517.5526707826266}}


# CatBoost

In [13]:
# # Prediction arrays
# val_pred1 = np.zeros(X.shape[0])
# val_pred2 = np.zeros(X.shape[0])

In [14]:
# FOLDS = 5

In [15]:
# def cat_cv1(iterations, learning_rate, depth, l2_leaf_reg, colsample_bylevel):
#     params = {  "loss_function" : "RMSE",
#                 "eval_metric" : "RMSE", 
#                 "use_best_model" : True,
#                 "random_seed" : seed,}
#     params['iterations'] = int(round(iterations))
#     params['learning_rate'] = max(min(learning_rate, 1), 0)
#     params["depth"] = int(round(depth))
#     params['l2_leaf_reg'] = l2_leaf_reg
#     params['colsample_bylevel'] = max(min(colsample_bylevel, 1), 0)
    
#     # K-fold cross valiation
#     kfold = KFold(n_splits=FOLDS)
#     for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
#         print("#"*60)
#         print("Training Fold - ", fold+1)
#         X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
#         X_val, y_val     = X.iloc[val_idx]  , y.iloc[val_idx]

#         # create lgb datasets
#         train_data1 = Pool(X_train, y_train.iloc[:,0], cat_features = ["STATUS_CHECK", "CATEGORY_1", "CATEGORY_2"])
#         val_data1   = Pool(X_val,   y_val.iloc[:,0],   cat_features = ["STATUS_CHECK", "CATEGORY_1", "CATEGORY_2"])

#         # Train model
#         print("#"*20)
#         print("Training for Y1")
#         model1 = CatBoostRegressor(iterations = params['iterations'],
#                          depth = params['depth'],
#                          learning_rate = params['learning_rate'],
#                          loss_function = params['loss_function'],
#                          eval_metric = params['eval_metric'],
#                          use_best_model = params['use_best_model'],
#                          random_seed = params['random_seed'],
#                          l2_leaf_reg = params['l2_leaf_reg'],
#                          colsample_bylevel = params['colsample_bylevel'],
#                          early_stopping_rounds = 1000)
#         model1.fit(train_data1,
#                    eval_set=val_data1,
#                    verbose_eval=200)

#         # Predict validation set
#         val_pred1[val_idx] = model1.predict(X_val)
    
#     return -np.sqrt(mse(train_["Y1"], val_pred1))

In [16]:
# pbounds1 = {'iterations': (1000, 100000),
#         'learning_rate': (0.001, 1.0),
#         'depth': (1, 16),
#         'l2_leaf_reg': (0.5, 100),
#         'colsample_bylevel': (0.1, 0.99)}

In [17]:
# cat_bo1 = BayesianOptimization(f = cat_cv1, pbounds = pbounds1, verbose = 2, random_state = seed)

In [18]:
# cat_bo1.maximize(init_points = 3, n_iter = 10)

In [19]:
# print(cat_bo1.max)

In [20]:
# def cat_cv2(iterations, learning_rate, depth, l2_leaf_reg, colsample_bylevel):
#     params = {  "loss_function" : "RMSE",
#                 "eval_metric" : "RMSE", 
#                 "use_best_model" : True,
#                 "random_seed" : seed,}
#     params['iterations'] = int(round(iterations))
#     params['learning_rate'] = max(min(learning_rate, 1), 0)
#     params["depth"] = int(round(depth))
#     params['l2_leaf_reg'] = l2_leaf_reg
#     params['colsample_bylevel'] = max(min(colsample_bylevel, 1), 0)
    
#     # K-fold cross valiation
#     kfold = KFold(n_splits=FOLDS)
#     for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y)):
#         print("#"*60)
#         print("Training Fold - ", fold+1)
#         X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
#         X_val, y_val     = X.iloc[val_idx]  , y.iloc[val_idx]

#         # create lgb datasets
#         train_data2 = Pool(X_train, y_train.iloc[:,1], cat_features = ["STATUS_CHECK", "CATEGORY_1", "CATEGORY_2"])
#         val_data2   = Pool(X_val,   y_val.iloc[:,1],   cat_features = ["STATUS_CHECK", "CATEGORY_1", "CATEGORY_2"])

#         # Train model
#         print("#"*20)
#         print("Training for Y2")
#         model2 = CatBoostRegressor(iterations = params['iterations'],
#                          depth = params['depth'],
#                          learning_rate = params['learning_rate'],
#                          loss_function = params['loss_function'],
#                          eval_metric = params['eval_metric'],
#                          use_best_model = params['use_best_model'],
#                          random_seed = params['random_seed'],
#                          l2_leaf_reg = params['l2_leaf_reg'],
#                          colsample_bylevel = params['colsample_bylevel'],
#                          early_stopping_rounds = 1000)
#         model2.fit(train_data2,
#                    eval_set=val_data2,
#                    verbose_eval=200)

#         # Predict validation set
#         val_pred2[val_idx] = model2.predict(X_val)
    
#     return -np.sqrt(mse(train_["Y2"], val_pred2))

In [21]:
# pbounds2 = {'iterations': (1000, 100000),
#         'learning_rate': (0.001, 1.0),
#         'depth': (1, 16),
#         'l2_leaf_reg': (0.5, 100),
#         'colsample_bylevel': (0.1, 0.99)}

In [22]:
# cat_bo2 = BayesianOptimization(f = cat_cv2, pbounds = pbounds2, verbose = 2, random_state = seed)

In [23]:
# cat_bo2.maximize(init_points = 3, n_iter = 10)

In [24]:
# print(cat_bo2.max)