In [None]:
from bayes_opt import BayesianOptimization
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import GroupKFold
import pandas as pd, numpy as np
import pickle
import lightgbm as lgb
from lightgbm import plot_importance
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Single seed shared by NumPy, the LightGBM seed parameters in lgbm_cv and
# the BayesianOptimization random_state.
seed = 21
np.random.seed(seed)

# Data with k-means sectors

In [None]:
# Load the pre-built training frame. lgbm_cv reads the columns row_id,
# time_id, stock_id and target from it.
# NOTE(review): pickle.load can execute arbitrary code during deserialisation;
# only load this file if the dataset source is trusted.
with open("../input/optiver-dataset-with-kmeans/training_data.pkl", "rb") as f:
    train = pickle.load(f)

In [None]:
# Number of GroupKFold splits used by lgbm_cv.
FOLDS = 5

In [None]:
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error.

    Computes ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.

    Parameters
    ----------
    y_true : array-like supporting element-wise arithmetic
        Ground-truth values; used as the denominator of the relative error.
    y_pred : array-like supporting element-wise arithmetic
        Predicted values.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared_relative_error = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared_relative_error)


def feval_rmspe(y_pred, lgb_train):
    """Custom LightGBM evaluation metric reporting RMSPE.

    Parameters
    ----------
    y_pred : array-like
        Predictions for the dataset being evaluated.
    lgb_train : object exposing ``get_label()``
        The dataset being evaluated; its labels are the ground truth.

    Returns
    -------
    tuple
        ``('RMSPE', value, False)``; the trailing ``False`` is LightGBM's
        ``is_higher_better`` flag, i.e. lower values are better.
    """
    score = rmspe(lgb_train.get_label(), y_pred)
    return 'RMSPE', score, False

In [None]:
def lgbm_cv(learning_rate, num_leaves, feature_fraction, feature_fraction_bynode, bagging_fraction, bagging_freq, min_data_in_leaf, min_sum_hessian_in_leaf, lambda_l1, lambda_l2):
    """Objective for the Bayesian optimiser: grouped-CV score of a LightGBM model.

    Trains one LightGBM regressor per ``GroupKFold`` split (grouped by
    ``time_id``) of the module-level ``train`` frame, collects the
    out-of-fold predictions and returns the negated RMSPE, so that a
    maximiser minimises the error.

    Parameters
    ----------
    learning_rate, feature_fraction, feature_fraction_bynode, bagging_fraction : float
        Forwarded to LightGBM after clamping to ``[0, 1]``.
    num_leaves, bagging_freq, min_data_in_leaf : float
        Rounded to the nearest integer before being forwarded to LightGBM.
    min_sum_hessian_in_leaf, lambda_l1, lambda_l2 : float
        Forwarded to LightGBM unchanged.

    Returns
    -------
    float
        ``-rmspe(target, out_of_fold_predictions)``.
    """
    params = {  'force_col_wise': True,
                'seed': seed,
                'feature_fraction_seed': seed,
                'bagging_seed': seed,
                'drop_seed': seed,
                'data_random_seed': seed,
                'objective': 'rmse',
                'boosting': 'gbdt',
                'verbosity': -1,
                'n_jobs': -1}
    params['learning_rate'] = max(min(learning_rate, 1), 0)
    params["num_leaves"] = int(round(num_leaves))
    params['feature_fraction'] = max(min(feature_fraction, 1), 0)
    params['feature_fraction_bynode'] = max(min(feature_fraction_bynode, 1), 0)
    params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
    # bagging_freq must be forwarded: LightGBM's default of 0 disables bagging,
    # which would make bagging_fraction (and this search dimension) a no-op.
    params['bagging_freq'] = int(round(bagging_freq))
    params['min_data_in_leaf'] = int(round(min_data_in_leaf))
    params['min_sum_hessian_in_leaf'] = min_sum_hessian_in_leaf
    params['lambda_l1'] = lambda_l1
    params['lambda_l2'] = lambda_l2

    # Prepare input data; folds are grouped by time_id so that rows sharing a
    # time_id never appear on both sides of a split.
    groups = train['time_id']
    x = train.drop(["row_id", "time_id", "target"], axis= 1)
    y = train['target']

    # Transform stock id to a numeric column (declared categorical below)
    x['stock_id'] = x['stock_id'].astype("int")

    # Out-of-fold predictions, filled in fold by fold
    val_pred = np.zeros(x.shape[0])

    # K-fold cross validation
    kfold = GroupKFold(n_splits=FOLDS)
    for train_idx, val_idx in kfold.split(x, y, groups):
        X_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val     = x.iloc[val_idx]  , y.iloc[val_idx]

        # Weight each row by 1 / target^2 so that the weighted squared-error
        # objective penalises relative rather than absolute error.
        train_weights = 1 / np.square(y_train)
        val_weights = 1 / np.square(y_val)

        # Create lgb datasets
        train_data = lgb.Dataset(X_train, y_train, weight = train_weights, categorical_feature = ["stock_id"])
        val_data   = lgb.Dataset(X_val,   y_val, weight = val_weights,   categorical_feature = ["stock_id"])

        # Train model.
        # NOTE(review): the early_stopping_rounds / verbose_eval keywords were
        # removed from lgb.train in LightGBM 4.0; switch to callbacks when
        # upgrading -- confirm the installed version.
        model = lgb.train(params = params,
                         train_set = train_data,
                         valid_sets = [train_data, val_data],
                         num_boost_round = 10000,
                         early_stopping_rounds = 100,
                         feval = feval_rmspe,
                         valid_names=["training", "validation"],
                         verbose_eval = 200)

        # Predict validation set
        val_pred[val_idx] = model.predict(X_val)

    # Negated because the optimiser maximises its objective
    return -rmspe(y, val_pred)

In [None]:
# Search space handed to the optimiser: (lower, upper) bound per lgbm_cv argument.
pbounds = dict(
    learning_rate=(0.01, 1.0),
    num_leaves=(20, 1000),
    feature_fraction=(0.1, 0.99),
    feature_fraction_bynode=(0.1, 0.99),
    bagging_fraction=(0.5, 1),
    bagging_freq=(10, 100),
    min_data_in_leaf=(20, 1000),
    min_sum_hessian_in_leaf=(0, 100),
    lambda_l1=(0.5, 10),
    lambda_l2=(0.5, 10),
)

In [None]:
# Optimiser that maximises lgbm_cv (the negated cross-validated RMSPE) over
# the ranges in pbounds; random_state reuses the notebook seed.
lgbm_bo = BayesianOptimization(f = lgbm_cv, pbounds = pbounds, verbose = 2, random_state = seed)

In [None]:
# Run the search: 2 random initial points followed by 20 guided iterations.
# Every evaluation calls lgbm_cv, i.e. trains FOLDS LightGBM models.
lgbm_bo.maximize(init_points = 2, n_iter = 20)

In [None]:
# Show the best result found so far. The objective value is the negated
# RMSPE returned by lgbm_cv, so values closer to zero are better.
print(lgbm_bo.max)