# XGBoost with Random Grid Search
### XGBClassifier (Sklearn API)

### No need for xgboost.DMatrix (XGBoost's own data matrix format) with the Sklearn API

In [2]:
import itertools
import time

import numpy as np
import pandas as pd
from xgboost import XGBClassifier

# Default random seed: used by xgb_grid for NumPy sampling and as the XGBoost `seed` parameter.
SEED = 123
# Value passed to XGBoost as `n_jobs` (number of parallel threads).
NTHREAD = 4

In [None]:
def xgb_grid(train, valid, mono_constraints=None, gs_params=None, early_stopping_rounds=None, n_models=None,
             ntree=None, verbose=None, seed=None):

    """ Performs a random grid search over n_models and gs_params.

    Candidate models are scored by their best AUC on the validation data;
    the fitted XGBClassifier with the highest score is returned.

    Relies on the notebook-level names X_var and y_var to select the feature
    columns and the target column from train and valid
    (presumably a list of column names and a single column name -- confirm
    against the cell that defines them).

    :param train: Training data, indexable by X_var and y_var.
    :param valid: Validation data, indexable by X_var and y_var.
    :param mono_constraints: User-supplied monotonicity constraints.
    :param gs_params: Dictionary of lists of potential XGBoost parameters over which to search.
                      None or an empty dictionary searches only the global parameters.
    :param early_stopping_rounds: XGBoost early stopping rounds.
    :param n_models: Number of random models to evaluate; must be a positive integer.
    :param ntree: Number of trees in XGBoost model.
    :param verbose: Whether to display training iterations, default False.
    :param seed: Random seed for reproducibility; falls back to the global SEED when None.
    :return: Best candidate model from random grid search.
    :raises ValueError: If n_models is None or smaller than 1.

    """

    if n_models is None or int(n_models) < 1:
        raise ValueError('n_models must be a positive integer.')
    n_models = int(n_models)

    # honor the caller's seed; the global SEED is only the fallback
    if seed is None:
        seed = SEED

    # cartesian product of gs_params (an empty grid yields one empty experiment)
    gs_params = gs_params or {}
    keys = list(gs_params)
    experiments = [dict(zip(keys, combo)) for combo in itertools.product(*gs_params.values())]

    # preserve exact reproducibility for this function
    np.random.seed(seed)

    # select randomly from cartesian product space
    # (sampling is with replacement, so a combination may be drawn more than once)
    selected_experiments = np.random.choice(len(experiments), n_models)

    # set global params for objective, etc.
    base_params = {'booster': 'gbtree',
                   'n_jobs': NTHREAD,
                   'objective': 'binary:logistic',
                   'n_estimators': ntree,
                   'seed': seed}
    if mono_constraints is not None:
        base_params['monotone_constraints'] = mono_constraints

    # evaluation sets are identical for every run, so build them once;
    # the validation data is deliberately last because model selection uses it
    watchlist = [(train[X_var], train[y_var]), (valid[X_var], valid[y_var])]
    valid_key = 'validation_%d' % (len(watchlist) - 1)

    # init grid search loop
    best_candidate = None
    best_score = -np.inf  # any real AUC beats this, so the first run is always kept

    # grid search loop
    for i, exp in enumerate(selected_experiments):

        # fresh dict per run: global params overridden by current grid run params
        params = dict(base_params, **experiments[exp])

        print('Grid search run %d/%d:' % (int(i + 1), int(n_models)))
        print('Training with parameters:', params)

        # train on current params
        # NOTE(review): newer xgboost releases expect early_stopping_rounds and
        # eval_metric on the constructor rather than fit() -- confirm against
        # the installed xgboost version.
        model = XGBClassifier(**params)
        candidate = model.fit(train[X_var], train[y_var],
                              early_stopping_rounds=early_stopping_rounds,
                              eval_set=watchlist,
                              eval_metric='auc',
                              verbose=verbose)

        # determine if current model is better than previous best,
        # using the best AUC recorded on the validation set
        auc_history = candidate.evals_result()[valid_key]['auc']
        present_best_score = np.max(auc_history)

        if present_best_score > best_score:
            best_candidate = candidate
            best_score = present_best_score
            print('Grid search new best score discovered at iteration %d/%d: %.4f.' %
                  (int(i + 1), int(n_models), present_best_score))

        print('---------- ----------')

    return best_candidate

In [None]:
# Candidate values for each XGBoost hyperparameter; xgb_grid samples
# combinations from the cartesian product of these lists.
gs_params = {
    'colsample_bytree': [0.3, 0.5, 0.7, 0.9],
    'colsample_bylevel': [0.3, 0.5, 0.7, 0.9],
    'learning_rate': [0.005, 0.05, 0.5],
    'max_depth': [3, 5, 7],
    'reg_alpha': [0.0005, 0.005, 0.05],
    'reg_lambda': [0.0005, 0.005, 0.05],
    'subsample': [0.3, 0.5, 0.7, 0.9],
    'min_child_weight': [1, 5, 10],
    'gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
}

# Monotonicity constraints derived from the sign of each feature's correlation
# with the target. Currently disabled: the search below passes no
# mono_constraints, so it runs unconstrained.
#train_corr_y = train[X_var + [y_var]].corr()[y_var].values[:-1]
#train_corr_y[np.isnan(train_corr_y)] = 1
#mono_constraints = tuple(int(i) for i in np.sign(train_corr_y))

# wall-clock start of the grid search
mxgb_tic = time.time()

# random grid search: 20 sampled models, up to 1000 trees each,
# early stopping after 100 rounds
best_mxgb = xgb_grid(
    train,
    valid,
    gs_params=gs_params,
    n_models=20,
    ntree=1000,
    early_stopping_rounds=100,
    verbose=False,
    seed=SEED,
)

# elapsed wall-clock time in seconds
mxgb_toc = time.time() - mxgb_tic
print('Monotonic GBM training completed in %.2f s.' % mxgb_toc)