<h2 style="color:#D198B7">Engine </h2>

<h3 style="color:#98D1B2 ">Load in required libraries</h3>

In [1]:
import numpy as np 
import pandas as pd 

from pandas_profiling import ProfileReport

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set()
#from sklearn.externals import joblib

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import sklearn.datasets, sklearn.model_selection

#from xgboost import XGBRegressor

import gc
from itertools import product
import time

from bayes_opt import BayesianOptimization
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events

import shap

<h3 style="color:#98D1B2 ">Define Custom Helper Functions</h3>

In [11]:
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    """
    Randomly partition a data frame into train, validation and test segments.

        Args:
            df:                 data frame to split into train, validate, test segments
            train_percent:      fraction of rows (0-1) assigned to the train split
            validate_percent:   fraction of rows (0-1) assigned to the validation split
            seed:               seed passed to np.random.seed for replication
                                (None re-seeds NumPy's global generator unpredictably)

        Returns:
            Train Data
            Validation Data
            Test Data (all rows not assigned to train or validation)

        Raises:
            ValueError: if a fraction is negative or the two fractions sum to more than 1

        Notes:
            Seeds NumPy's *global* random state as a side effect.
    """

    # Negative fractions would turn into negative slice bounds and yield
    # overlapping splits; a sum above 1 would leave validation/test short or empty.
    if train_percent < 0 or validate_percent < 0:
        raise ValueError("train_percent and validate_percent must be non-negative")
    if train_percent + validate_percent > 1 + 1e-9:
        raise ValueError("train_percent + validate_percent must not exceed 1")

    np.random.seed(seed)
    m = len(df.index)
    # Permute row POSITIONS (0..m-1), not index labels: the slices below use
    # .iloc, which is positional. Permuting df.index only worked when the frame
    # carried a default RangeIndex; for that case the result is unchanged.
    perm = np.random.permutation(m)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [15]:
# Bayesian black box we wish to optimize - LightGBM hyperparameters

def lgb_black_box(

    num_leaves,  # int
    min_data_in_leaf,  # int
    learning_rate,
    min_sum_hessian_in_leaf,  # float
    feature_fraction,
    lambda_l1,
    lambda_l2,
    min_gain_to_split,
    max_depth):  # int
    """
    Train LightGBM on one train/validation fold and return the negated RMSE.

        Args:
            num_leaves, min_data_in_leaf, max_depth:
                proposed as floats by the optimizer and truncated to int here
            learning_rate, min_sum_hessian_in_leaf, feature_fraction,
            lambda_l1, lambda_l2, min_gain_to_split:
                passed to LightGBM unchanged

        Returns:
            -rmse on the validation fold (negated because the optimizer maximises);
            the script could be refit to adapt to other metrics

        Notes:
            First draft.
            The data is NOT passed in: the function reads the notebook globals
            X_valid_train, y_valid_train, bayesian_tr_index, bayesian_val_index
            and predictors, which must exist in the kernel before it is called.
            In practice would add the option to set these via this function.
    """

    # lgb needs some inputs as int but the BayesianOptimization library sends
    # continuous values, so we change the type.
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)

    # All these hyperparameter values are just for test; the goal here is to show
    # how to use Bayesian optimization. See the lgb documentation for details.
    params = {
        'num_leaves': num_leaves,
        'max_bin': 63,
        'min_data_in_leaf': min_data_in_leaf,
        'learning_rate': learning_rate,
        'min_sum_hessian_in_leaf': min_sum_hessian_in_leaf,
        'bagging_fraction': 1.0,
        'bagging_freq': 5,
        'feature_fraction': feature_fraction,
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'min_gain_to_split': min_gain_to_split,
        'max_depth': max_depth,
        'save_binary': True,
        'seed': 1337,
        'feature_fraction_seed': 1337,
        'bagging_seed': 1337,
        'drop_seed': 1337,
        'data_random_seed': 1337,
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'verbose': 1,
        'metric': 'rmse',
        # NOTE(review): LightGBM documents is_unbalance for binary/multiclassova
        # objectives only; kept so the parameter set is unchanged - confirm and drop.
        'is_unbalance': True,
        'boost_from_average': False,
    }

    # Select fold rows by POSITION for both features and labels. The features
    # already use .iloc; plain y_valid_train[idx] is label-based when the target
    # is a pandas Series, which misaligns rows and labels (or raises) whenever
    # its index is not the default 0..n-1. np.asarray makes the lookup positional
    # for Series and arrays alike.
    labels = np.asarray(y_valid_train)
    train_features = X_valid_train.iloc[bayesian_tr_index].values
    valid_features = X_valid_train.iloc[bayesian_val_index].values
    train_labels = labels[bayesian_tr_index]
    valid_labels = labels[bayesian_val_index]

    train_data = lgb.Dataset(train_features,
                             label=train_labels,
                             feature_name=predictors,
                             free_raw_data=False)

    validation_data = lgb.Dataset(valid_features,
                                  label=valid_labels,
                                  feature_name=predictors,
                                  free_raw_data=False)

    num_round = 5000
    if hasattr(lgb, "log_evaluation"):
        # Newer LightGBM: early stopping and periodic logging are callbacks
        # (the verbose_eval / early_stopping_rounds keywords were removed in 4.0).
        clf = lgb.train(params, train_data, num_round, valid_sets=[validation_data],
                        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(250)])
    else:
        # Older LightGBM without log_evaluation: keep the legacy keywords.
        clf = lgb.train(params, train_data, num_round, valid_sets=[validation_data],
                        verbose_eval=250, early_stopping_rounds=50)

    predictions = clf.predict(valid_features, num_iteration=clf.best_iteration)

    # We need a regression score; roc_auc_score is a classification score and
    # cannot be used here.
    mse = mean_squared_error(valid_labels, predictions)
    rmse = np.sqrt(mse)
    # The optimizer expects an increasing number to mean "better", so negate.
    return -rmse

In [None]:
def column_index(df, query_cols):
    """
    Return the positional index of each requested column in df.columns.

        Args:
            df:           data frame whose columns are searched
            query_cols:   column name, or sequence of column names, to locate

        Returns:
            Integer position(s) in df.columns, in the order of query_cols

        Raises:
            KeyError: if any requested column is not present in df
    """
    cols = df.columns.values

    # searchsorted only reports an insertion point: for an absent name it would
    # return the position of a *different* column (or run past the end), so
    # reject unknown names explicitly before looking anything up.
    known = set(cols)
    missing = [str(c) for c in np.atleast_1d(query_cols) if c not in known]
    if missing:
        raise KeyError("Columns not found in DataFrame: {}".format(missing))

    # Binary-search the sorted view of the column names, then map the sorted
    # positions back to the original column positions.
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols, query_cols, sorter=sidx)]

In [9]:
# Search space handed to the optimizer: one (lower, upper) range per
# hyperparameter. Base ranges - for testing.
LGB_bound = dict(
    # tree-shape controls (truncated to int inside lgb_black_box)
    num_leaves=(5, 20),
    min_data_in_leaf=(5, 20),
    # continuous parameters
    learning_rate=(0.01, 0.3),
    min_sum_hessian_in_leaf=(0.00001, 0.01),
    feature_fraction=(0.05, 0.5),
    lambda_l1=(0, 5.0),
    lambda_l2=(0, 5.0),
    min_gain_to_split=(0, 1.0),
    # truncated to int inside lgb_black_box
    max_depth=(3, 15),
)

In [16]:
# Build the optimizer from three arguments:
#   f            - the black-box function whose return value is maximised
#   pbounds      - the hyperparameter search ranges
#   random_state - seed for the optimizer (the particular value does not matter)
optimizer = BayesianOptimization(f=lgb_black_box, pbounds=LGB_bound, random_state=13)

# Show the parameter names registered in the optimizer's search space.
print(optimizer.space.keys)

['feature_fraction', 'lambda_l1', 'lambda_l2', 'learning_rate', 'max_depth', 'min_data_in_leaf', 'min_gain_to_split', 'min_sum_hessian_in_leaf', 'num_leaves']
