In [1]:
# Revision stamp of this notebook (a date string).
__version__ = '2019-08-04'
# When True, the demo cell guarded by `if __DEBUG_MODE__:` further down is executed.
__DEBUG_MODE__ = True

In [37]:
from  more_itertools import unique_everseen
import pandas as pd
import numpy as np
import warnings

def xgrid_search(space, xgrid_search_objective, 
                 MAX_RESOLUTION=20, MAX_GRIDS_PER_RESOL=50, MAX_NUM_MIN_GRIDS=100000, MIN_REL_ERROR = 1e-5,
                 THRESHOLD_NUM_BEST_GRIDS = 1, VERBOSE = True):
    """Minimise ``xgrid_search_objective`` with an adaptive, cross-shaped grid search.

    The search first evaluates the full Cartesian product of the values listed in
    ``space``.  It then repeatedly evaluates the axis-aligned neighbours of the best
    grid point(s) found so far.  When no unevaluated neighbour is left at the current
    resolution the stop condition is checked; if it is not met, every interval of
    every parameter is split in two and the search continues on the finer grid.

    Parameters
    ----------
    space : dict
        ``{name: (kind, values)}``.  ``kind`` selects how intervals are split:
        ``'uni'`` (arithmetic midpoint), ``'log'`` (geometric midpoint, values must be
        non-negative) or ``'int'`` (rounded arithmetic midpoint).  ``values`` is the
        initial list of grid values; duplicates are dropped.
    xgrid_search_objective : callable
        Called as ``xgrid_search_objective(**{name: value, ...})``; returns the score
        to be minimised.
    MAX_RESOLUTION : int
        Maximum number of resolution levels (refinement rounds).
    MAX_GRIDS_PER_RESOL : int
        Maximum number of neighbour-expansion passes per resolution level.
    MAX_NUM_MIN_GRIDS : int
        Stop when more than this many grid points tie for the best score.
    MIN_REL_ERROR : float
        Stop when the largest deviation of the neighbours' scores from the best
        score, relative to the best score, drops below this value.  When the best
        score is exactly 0 the deviation is used as an absolute error instead.
    THRESHOLD_NUM_BEST_GRIDS : int
        Only used by ``choose_best_of_best_grids``, whose call sites are disabled.
    VERBOSE : bool
        When False, per-evaluation progress lines are not printed.  Limit warnings
        are always printed.

    Returns
    -------
    best_params : list of dict
        One ``{name: value}`` dict per grid point that attains the minimum score.
    grids : list of tuple
        Every grid point that received a score (evaluated or estimated).
    fmins : list
        The scores, aligned with ``grids``.

    Raises
    ------
    ValueError
        If ``space`` is empty, a parameter has no values, a parameter type is
        unknown, a ``'log'`` parameter has a negative value, or a loop limit is
        smaller than 1.

    Notes
    -----
    A candidate whose two opposite neighbours along some axis have identical known
    scores is *not* evaluated; it is assigned that shared score.  This saves work on
    plateaus, but it also skips a point that sits between two equal values of a
    non-flat function.
    """
    def log(message):
        # progress output only; warnings bypass this and are always printed
        if VERBOSE: print(message)

    def increase_para_resolution(ptype, spara):
        def divide_intervals(para_type, v):
            # interleave the existing values (even slots) with one midpoint per interval (odd slots)
            v = np.array(v)
            if para_type == 'uni':
                vv = np.full(2*len(v)-1, np.nan); vv[0::2] = v; vv[1::2] = (v[1:]+v[:-1])/2
            elif para_type == 'log':
                vv = np.full(2*len(v)-1, np.nan); vv[0::2] = v; vv[1::2] = np.sqrt(v[1:]*v[:-1])
            elif para_type == 'int':
                vv = np.full(2*len(v)-1, 0);      vv[0::2] = v; vv[1::2] = np.around((v[1:]+v[:-1])/2).astype(int)
            else:
                raise ValueError("Unknown parameter type %r (expected 'uni', 'log' or 'int')"%(para_type,))
            return np.unique(vv) # numpy returns sorted data; also drops midpoints that coincide with an endpoint
        return {k:divide_intervals(ptype[k], v) for k,v in spara.items()} # update with more intervals

    def generate_xgrids(spara, grid): # generate new grids from best grid
        # returns the axis-aligned neighbours of `grid` and, for each one, the axis index it lies on
        xgrids = []
        pairs = []
        for i,v in enumerate(spara.values()):
            n = np.where(v == grid[i])[0][0]
            if 0 <= (n-1):     copy = list(grid); copy[i] = v[n-1]; xgrids.append(tuple(copy)); pairs.append(i)
            if (n+1) < len(v): copy = list(grid); copy[i] = v[n+1]; xgrids.append(tuple(copy)); pairs.append(i)
        return xgrids, pairs

    def find_all_best_grids(grids, fmins):
        min_value = min(fmins)
        best_grids = [grids[i] for i, v in enumerate(fmins) if v == min_value]
        return best_grids, min_value

    def generate_neighbors(spara, grids):
        ngrids = []
        for grid in grids:
            xgrids, _ = generate_xgrids(spara, grid)
            ngrids.extend(xgrids) # xgrid search
        ngrids = list(set(ngrids).difference(grids)) # remove the best points
        return ngrids

    def grid_to_param(spara, grid):
        return {k:grid[i] for i,k in enumerate(spara.keys())}

    def estimate_f_from_existing_neighbors(grids0, spara, grids, fmins):
        # hash lookup instead of repeated list scans; setdefault keeps the first
        # occurrence, which is what grids.index() would have returned
        known = {}
        for g, f in zip(grids, fmins):
            known.setdefault(g, f)

        grids_est, fmins_est = [], []
        for grid in grids0:
            xgrids, pairs = generate_xgrids(spara, grid)
            evals = [known.get(x, np.nan) for x in xgrids]
            for i in range(len(grid)):
                fval = [evals[n] for n,p in enumerate(pairs) if p==i]
                # both neighbours on axis i are known and agree -> reuse their score
                if len(fval) == 2 and (not np.isnan(fval[0])) and (not np.isnan(fval[1])) and (fval[0] == fval[1]):
                    grids_est.append(grid)
                    fmins_est.append(fval[0])
                    break
        grids0 = list(set(grids0).difference(grids_est))
        return grids0, grids_est, fmins_est

    def choose_best_of_best_grids(bgrids, grids0, spara, grids, actives):
        # optional pruning step; its call sites below are currently commented out
        num_branches, grp_xgrids0, bbgrids = [], [], []
        for bgrid in bgrids:
            if actives[grids.index(bgrid)]:
                xgrids, _ = generate_xgrids(spara, bgrid)
                xgrids0 = list(set(xgrids).intersection(grids0))

                num_branches.append(len(xgrids0))
                grp_xgrids0.append(xgrids0)
                bbgrids.append(bgrid)

        if not num_branches: # no active best grid: nothing to expand (max() of an empty list would raise)
            return [], actives

        max_num_branches, grids00, cnt = max(num_branches), [], 1
        for i, bbgrid in enumerate(bbgrids):
            if (num_branches[i] == max_num_branches) and (cnt <= THRESHOLD_NUM_BEST_GRIDS):
                grids00.extend(grp_xgrids0[i])
                cnt = cnt + 1
            else:
                actives[grids.index(bbgrid)] = 0
        grids00 = list(set(grids00))

        return grids00, actives

    def update_grids_etc(grids, fmins, actives, grids0, fmins0, actives0):
        grids.extend(grids0)
        fmins.extend(fmins0)
        actives.extend(actives0)
        return grids, fmins, actives

    def absorb_estimated_grids(grids0, spara, grids, fmins, actives):
        # store candidates whose score can be copied from their neighbours; return the rest
        grids0, grids_est, fmins_est = estimate_f_from_existing_neighbors(grids0, spara, grids, fmins)
        actives_est = [1 for g in grids_est]
        update_grids_etc(grids, fmins, actives, grids_est, fmins_est, actives_est)
        if len(grids_est) > 0: log('> The function values of %s points are estimated and added without calculation'%(len(grids_est)))
        return grids0

    #------------------------------------------------------------------
    # input validation
    if MAX_RESOLUTION < 1 or MAX_GRIDS_PER_RESOL < 1:
        raise ValueError('MAX_RESOLUTION and MAX_GRIDS_PER_RESOL must be at least 1')
    if len(space) == 0:
        raise ValueError('space must define at least one parameter')

    # initial preparation
    space = {k:(v[0], list(dict.fromkeys(v[1]))) for k,v in space.items()} # dict.fromkeys: drop duplicates, keep first-seen order
    for k,(kind, values) in space.items():
        if kind not in ('uni', 'log', 'int'):
            raise ValueError("Unknown parameter type %r for %r (expected 'uni', 'log' or 'int')"%(kind, k))
        if len(values) == 0:
            raise ValueError('Parameter %r has no candidate values'%(k,))
        if kind == 'log' and min(values) < 0:
            raise ValueError("Parameter %r is of type 'log' and must not contain negative values"%(k,))

    ptype = {k:v[0] for k,v in space.items()} # data type
    param = {k:v[1] for k,v in space.items()} # data list

    spara = {k:np.unique(v) for k,v in param.items()} # sorted data list

    combi = pd.DataFrame(index = pd.MultiIndex.from_product(param.values(), names = param.keys())).reset_index() # https://stackoverflow.com/questions/13269890/cartesian-product-in-pandas
    grids0 = list(dict.fromkeys(tuple(row[1:]) for row in combi.itertuples())) # make sure coordinates are unique

    #------------------------------------------------------------------
    grids, fmins, actives, cnt = [], [], [], 0
    for n in range(MAX_RESOLUTION):
        for m in range(MAX_GRIDS_PER_RESOL):
            fmins0, actives0 = [], []
            for grid in grids0:
                kwargs = grid_to_param(spara, grid)
                fmins0.append(xgrid_search_objective(**kwargs)) # create fmins0
                actives0.append(1)
                cnt = cnt + 1
                log('%s (r=%s/%s; g=%s/%s); %s; score=%s'%(cnt,n+1,MAX_RESOLUTION,m+1,MAX_GRIDS_PER_RESOL,kwargs,fmins0[-1]))
            grids, fmins, actives = update_grids_etc(grids, fmins, actives, grids0, fmins0, actives0)

            bgrids, min_value = find_all_best_grids(grids, fmins) # best grids
            log('> One of %s best param(s) = %s; best score = %s'%(len(bgrids), grid_to_param(spara, bgrids[0]), min_value))
            ngrids = generate_neighbors(spara, bgrids) # neighboring grids
            grids0 = list(set(ngrids).difference(grids)) # remove the existing grids from the neighboring grids

            # grids0, actives = choose_best_of_best_grids(bgrids, grids0, spara, grids, actives)

            grids0 = absorb_estimated_grids(grids0, spara, grids, fmins, actives)

            if len(grids0) == 0: break
            elif m+1 == MAX_GRIDS_PER_RESOL: print('BRAD WARNING:: MAX_GRIDS_PER_RESOL = %s is reached...'%MAX_GRIDS_PER_RESOL)

        # stop condition
        if (n > 0) and (set(bgrids) == set(bgrids_prev)): # if the best grid points are the same as the previous ones
            neighbors = set(ngrids).difference(bgrids).difference(set(ngrids_prev).difference(bgrids_prev)) # exclude the same previous neighboring grid points
        else:
            neighbors = set(ngrids).difference(bgrids) # neighboring grid points
        ngrids_prev, bgrids_prev = ngrids, bgrids # back up for the next round
        fmin_neighbors = np.array([fmins[i] for i,g in enumerate(grids) if g in neighbors]) # fmin values of the neighboring grid points
        if len(fmin_neighbors) > 0:
            # a best score of exactly 0 cannot be used as a divisor (it would give inf/nan and
            # the search could never converge); use the absolute deviation in that case
            scale = abs(min_value) if min_value != 0 else 1.0
            rel_error = np.max(np.abs(fmin_neighbors - min_value)) / scale
        else:
            rel_error = 0

        if rel_error < MIN_REL_ERROR: break
        elif len(bgrids) > MAX_NUM_MIN_GRIDS: print('BRAD WARNING:: MAX_NUM_MIN_GRIDS = %s is reached...'%MAX_NUM_MIN_GRIDS); break
        elif ((n+1 == MAX_RESOLUTION)): print('BRAD WARNING:: MAX_RESOLUTION = %s is reached...'%MAX_RESOLUTION); break
        else: 
            spara = increase_para_resolution(ptype, spara)

            bgrids, _ = find_all_best_grids(grids, fmins) # best grids
            ngrids = generate_neighbors(spara, bgrids)
            grids0 = list(set(ngrids).difference(grids)) # remove if newly found points are already in the list

            grids0 = absorb_estimated_grids(grids0, spara, grids, fmins, actives)

            # grids0, actives = choose_best_of_best_grids(bgrids, grids0, spara, grids, actives)

    best_params = [grid_to_param(spara, best) for best in find_all_best_grids(grids, fmins)[0]]
    return best_params, grids, fmins

#####################################################################
# space = { # define the data type and input variable range
#     'z': ['uni', [-10, -1, 10]],  # uni / log / int
#     'x': ['int', [-10, 10]], # uni / log / int
# #     'x': ['uni', [-10, 10]], # uni / log / int
#     'y': ['log', [0, 0.01, 10]], # uni / log / int
# }

# def xgrid_search_objective(**param):
#     #--- START user definition ---
#     x,y,z = param['x'],param['y'],param['z'] 
#     output = ((z+2.6)**2) + ((x-2.7)**2) + ((y-3.3)**2) + 1.0
#     #--- END user definition ---
#     return output
#------------------------------------------------------------------
# Search space for the demo run below: each entry maps a parameter name to
# [type, initial grid values]; the type selects how xgrid_search splits intervals.
space = { # define the data type and input variable range
    'x': ['uni', [-10, 1, 10]], # uni / log / int
    'y': ['uni', [-10, 10]], # uni / log / int
}

def xgrid_search_objective(**param):
    """Toy objective for the demo run.

    The score depends on ``x`` only: ``x*(x-10) + 1.0`` for ``x > 0`` and ``1.0``
    otherwise.  ``y`` has to be supplied but does not influence the result.
    """
    #--- START user definition ---
    x, _unused_y = param['x'], param['y']  # 'y' is still looked up so a missing key raises KeyError
    base = x*(x-10) if x > 0 else 0
    #--- END user definition ---
    return base + 1.0
#------------------------------------------------------------------

# Run the search on the toy problem; xgrid_search returns every parameter set that
# attains the minimum, plus all scored grid points and their scores.
best_params, grids, fmins = xgrid_search(space, xgrid_search_objective)
best_params[0]  # display the first of the best parameter sets

1 (r=1/20; g=1/50); {'x': -10, 'y': -10}; score=1.0
2 (r=1/20; g=1/50); {'x': -10, 'y': 10}; score=1.0
3 (r=1/20; g=1/50); {'x': 1, 'y': -10}; score=-8.0
4 (r=1/20; g=1/50); {'x': 1, 'y': 10}; score=-8.0
5 (r=1/20; g=1/50); {'x': 10, 'y': -10}; score=1.0
6 (r=1/20; g=1/50); {'x': 10, 'y': 10}; score=1.0
> One of 2 best param(s) = {'x': 1, 'y': -10}; best score = -8.0
> The function values of 1 points are estimated and added without calculation
7 (r=2/20; g=1/50); {'x': 5.5, 'y': 10}; score=-23.75
8 (r=2/20; g=1/50); {'x': 5.5, 'y': -10}; score=-23.75
9 (r=2/20; g=1/50); {'x': -4.5, 'y': 10}; score=1.0
10 (r=2/20; g=1/50); {'x': -4.5, 'y': -10}; score=1.0
> One of 2 best param(s) = {'x': 5.5, 'y': 10}; best score = -23.75
> The function values of 1 points are estimated and added without calculation
> The function values of 2 points are estimated and added without calculation
11 (r=3/20; g=1/50); {'x': 7.75, 'y': 0.0}; score=-16.4375
12 (r=3/20; g=1/50); {'x': 3.25, 'y': -10}; score=-20.

> The function values of 128 points are estimated and added without calculation
275 (r=9/20; g=1/50); {'x': 5.04296875, 'y': -1.25}; score=-23.998153686523438
276 (r=9/20; g=1/50); {'x': 5.04296875, 'y': -1.875}; score=-23.998153686523438
277 (r=9/20; g=1/50); {'x': 5.04296875, 'y': -1.5625}; score=-23.998153686523438
278 (r=9/20; g=1/50); {'x': 4.97265625, 'y': 10}; score=-23.999252319335938
279 (r=9/20; g=1/50); {'x': 4.97265625, 'y': 5.0}; score=-23.999252319335938
280 (r=9/20; g=1/50); {'x': 4.97265625, 'y': 5.625}; score=-23.999252319335938
281 (r=9/20; g=1/50); {'x': 4.97265625, 'y': 5.3125}; score=-23.999252319335938
282 (r=9/20; g=1/50); {'x': 4.97265625, 'y': 5.9375}; score=-23.999252319335938
283 (r=9/20; g=1/50); {'x': 4.97265625, 'y': 5.15625}; score=-23.999252319335938
284 (r=9/20; g=1/50); {'x': 4.97265625, 'y': 5.46875}; score=-23.999252319335938
285 (r=9/20; g=1/50); {'x': 4.97265625, 'y': 5.78125}; score=-23.999252319335938
286 (r=9/20; g=1/50); {'x': 5.04296875, 'y': 

> The function values of 256 points are estimated and added without calculation
533 (r=10/20; g=1/50); {'x': 5.025390625, 'y': -1.25}; score=-23.99935531616211
534 (r=10/20; g=1/50); {'x': 5.025390625, 'y': -1.875}; score=-23.99935531616211
535 (r=10/20; g=1/50); {'x': 5.025390625, 'y': -1.5625}; score=-23.99935531616211
536 (r=10/20; g=1/50); {'x': 5.025390625, 'y': -1.40625}; score=-23.99935531616211
537 (r=10/20; g=1/50); {'x': 4.990234375, 'y': 5.0}; score=-23.99990463256836
538 (r=10/20; g=1/50); {'x': 4.990234375, 'y': 5.625}; score=-23.99990463256836
539 (r=10/20; g=1/50); {'x': 4.990234375, 'y': 5.3125}; score=-23.99990463256836
540 (r=10/20; g=1/50); {'x': 4.990234375, 'y': 5.9375}; score=-23.99990463256836
541 (r=10/20; g=1/50); {'x': 4.990234375, 'y': 5.15625}; score=-23.99990463256836
542 (r=10/20; g=1/50); {'x': 4.990234375, 'y': 5.46875}; score=-23.99990463256836
543 (r=10/20; g=1/50); {'x': 4.990234375, 'y': 5.78125}; score=-23.99990463256836
544 (r=10/20; g=1/50); {'x':

> The function values of 512 points are estimated and added without calculation
> The function values of 512 points are estimated and added without calculation
1047 (r=11/20; g=1/50); {'x': 5.0166015625, 'y': -1.6015625}; score=-23.99972438812256
1048 (r=11/20; g=1/50); {'x': 5.0166015625, 'y': -1.796875}; score=-23.99972438812256
1049 (r=11/20; g=1/50); {'x': 5.0166015625, 'y': -1.40625}; score=-23.99972438812256
1050 (r=11/20; g=1/50); {'x': 5.0166015625, 'y': -0.859375}; score=-23.99972438812256
1051 (r=11/20; g=1/50); {'x': 5.0166015625, 'y': 0.703125}; score=-23.99972438812256
1052 (r=11/20; g=1/50); {'x': 5.0166015625, 'y': 0.078125}; score=-23.99972438812256
1053 (r=11/20; g=1/50); {'x': 5.0166015625, 'y': -0.546875}; score=-23.99972438812256
1054 (r=11/20; g=1/50); {'x': 4.9990234375, 'y': -7.8125}; score=-23.999999046325684
1055 (r=11/20; g=1/50); {'x': 4.9990234375, 'y': -7.265625}; score=-23.999999046325684
1056 (r=11/20; g=1/50); {'x': 4.9990234375, 'y': -7.109375}; score=-

1639 (r=11/20; g=1/50); {'x': 5.0166015625, 'y': -6.8359375}; score=-23.99972438812256
1640 (r=11/20; g=1/50); {'x': 5.0166015625, 'y': -6.0546875}; score=-23.99972438812256
1641 (r=11/20; g=1/50); {'x': 4.9990234375, 'y': 1.6015625}; score=-23.999999046325684
1642 (r=11/20; g=1/50); {'x': 4.9990234375, 'y': 1.40625}; score=-23.999999046325684
1643 (r=11/20; g=1/50); {'x': 4.9990234375, 'y': 1.09375}; score=-23.999999046325684
1644 (r=11/20; g=1/50); {'x': 4.9990234375, 'y': 1.5625}; score=-23.999999046325684
1645 (r=11/20; g=1/50); {'x': 4.9990234375, 'y': 1.25}; score=-23.999999046325684
1646 (r=11/20; g=1/50); {'x': 4.9990234375, 'y': 1.1328125}; score=-23.999999046325684
1647 (r=11/20; g=1/50); {'x': 4.9990234375, 'y': 1.796875}; score=-23.999999046325684
1648 (r=11/20; g=1/50); {'x': 4.9990234375, 'y': 1.71875}; score=-23.999999046325684
1649 (r=11/20; g=1/50); {'x': 4.9990234375, 'y': 1.0546875}; score=-23.999999046325684
1650 (r=11/20; g=1/50); {'x': 4.9990234375, 'y': 1.3671875

> The function values of 512 points are estimated and added without calculation


{'x': 4.9990234375, 'y': -7.8125}

In [None]:
import pandas as pd
import numpy as np
import time
import xgboost as xgb
from xgboost.sklearn import XGBClassifier, XGBRegressor
import lightgbm as lgb
from lightgbm.sklearn import LGBMClassifier, LGBMRegressor
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from sklearn import metrics   #Additional scklearn functions

def xgrid_search_boosting(train, test, features, target, params_schedule, model, eval_metric, 
                          CV_MODE = 'cross validation 1', N_FOLD = 5, N_BOOST_ROUND = 10000, EARLY_STOPPING = 50, RAND_SEED = 123): 
    """Tune a boosting model stage by stage with ``xgrid_search``, then fit and predict.

    For every search space in ``params_schedule`` the cross-validated score is
    minimised with ``xgrid_search`` and the first best parameter set is written into
    ``model`` before the next stage starts.  Afterwards the model is fitted on
    ``train`` and used to predict ``test``.

    Parameters
    ----------
    train, test : indexed with ``[features]`` / ``[target]``
        Presumably pandas DataFrames -- ``train[target].values`` is used.
    features, target : column selectors for ``train`` / ``test``.
    params_schedule : iterable of dict
        Search spaces in the format expected by ``xgrid_search``, tuned one after
        another.  An empty schedule skips tuning.
    model : estimator
        The library is detected from ``str(model)`` containing ``'XGB'`` or
        ``'LGB'``.  The object is modified in place via ``set_params``.
    eval_metric : str
        ``'rmse'``, ``'error'`` or ``'auc'`` for XGBoost; ``'rmse'``,
        ``'binary_error'`` or ``'auc'`` for LightGBM.  ``'auc'`` is negated so that
        every score can be minimised.
    CV_MODE : str
        Only ``'cross validation 1'`` (the library's built-in ``cv``) is implemented.
    N_FOLD, N_BOOST_ROUND, EARLY_STOPPING, RAND_SEED
        Forwarded to the library's ``cv`` as ``nfold``, ``num_boost_round``,
        ``early_stopping_rounds`` and ``seed``.

    Returns
    -------
    model, test_pred
        The tuned, fitted model and its predictions for ``test[features]``.

    Raises
    ------
    ValueError
        For an unsupported ``CV_MODE``, model type or ``eval_metric``.  This is
        raised before any cross-validation is run.
    """
    #--- user_defined_eval_function --------------------------------------------------------------
    def user_defined_eval_function(train, test, features, target, model, eval_metric, predict_test_output = False):
        # NOTE: earlier drafts of a hold-out 'validation' mode and a manual K-fold
        # 'cross validation 2' mode were never enabled; only the mode below exists.
        if CV_MODE != 'cross validation 1':
            raise ValueError("Unsupported CV_MODE %r; only 'cross validation 1' is implemented"%(CV_MODE,))

        use_xgb = 'XGB' in str(model)
        use_lgb = (not use_xgb) and ('LGB' in str(model))
        if use_xgb:   lower_is_better, higher_is_better = ['rmse','error'], ['auc']
        elif use_lgb: lower_is_better, higher_is_better = ['rmse','binary_error'], ['auc']
        else:
            raise ValueError("Unsupported model %s; expected an XGBoost or LightGBM estimator"%type(model).__name__)

        if eval_metric in lower_is_better:    negate = False
        elif eval_metric in higher_is_better: negate = True # maximised metrics are minimised as their negative
        else:
            raise ValueError("Unsupported eval_metric %r; expected one of %s"%(eval_metric, lower_is_better + higher_is_better))

        if use_xgb: # cross validation 1: XGBoost
            dtrain = xgb.DMatrix(train[features], label=train[target], missing=np.nan) # missing value handling: https://www.youtube.com/watch?v=cVqDguNWh4M
            cvoutp = xgb.cv(model.get_xgb_params(), dtrain, num_boost_round=N_BOOST_ROUND, verbose_eval=False,
                              nfold=N_FOLD, metrics=eval_metric, early_stopping_rounds=EARLY_STOPPING, seed=RAND_SEED) 
            model.set_params(n_estimators = cvoutp.shape[0]) # update n_estimator with the number of rounds kept by cv
            cv_score = cvoutp.tail(1)[cvoutp.columns[cvoutp.columns.str.contains('test-.+-mean', regex=True)]].squeeze()
        else: # cross validation 1: LightGBM
            dtrain = lgb.Dataset(train[features], label=train[target])
            cvoutp = lgb.cv({k:v for k,v in model.get_params().items() if k not in ['n_estimators', 'silent']}, # exclude n_estimators because of the argument, 'num_boost_round'
                            dtrain, num_boost_round=N_BOOST_ROUND, verbose_eval=False, 
                            nfold=N_FOLD, metrics=eval_metric, early_stopping_rounds=EARLY_STOPPING, seed=RAND_SEED)
            model.set_params(n_estimators = len(cvoutp[eval_metric+'-mean'])) # update n_estimator with the best num_boost_round
            cv_score = cvoutp[eval_metric+'-mean'][-1] # best CV score

        valid_score = -cv_score if negate else +cv_score

        if predict_test_output == True:
            model.fit(train[features], train[target].values.ravel(), eval_metric = eval_metric) #Fit the algorithm on the data
            test_pred = model.predict(test[features])    
        else: 
            test_pred = []
        return test_pred, valid_score    

    #--- evaluation function with xgboost, lightgbm ------------------------------
    def xgrid_search_objective(**param): # define user function with all the input variables
        #--- START: user defined function -----------------------------
        model.set_params(**param) # update some parameters
        test_pred, valid_score = user_defined_eval_function(train, test, features, target, model, eval_metric)
        #--- END ------------------------------------------------------
        return valid_score

    #-----------------------------------------------------------------------
    tic = time.time()
    for params in params_schedule:
        # xgrid_search returns (best parameter sets, all scored grid points, all scores)
        best_params, grids, fmins = xgrid_search(params, xgrid_search_objective)
        model.set_params(**best_params[0]) # update some parameters with the best so far
        print('Best Param = ',best_params[0])
        print('Best Score = ',min(fmins)) # the best score only, not the whole list of scores
        print(model)

    test_pred, valid_score = user_defined_eval_function(train, test, features, target, model, eval_metric, predict_test_output = True)
    toc = time.time()
    print('Time Elapsed = %s sec'%(toc - tic))
    print('Final Validatoin Score = ',valid_score)
    print(model) # final model confirmation

    return model, test_pred

In [None]:
# Demo / smoke test: builds a synthetic dataset, then tunes a LightGBM and an
# XGBoost model with xgrid_search_boosting. Runs only when __DEBUG_MODE__ is truthy.
if __DEBUG_MODE__:
    import sys
    sys.path.append("../")  # makes the parent directory importable -- presumably where `utils` lives; confirm
    import utils

    # test dataset
    if True:  # hard-coded switch: True -> classification data, False -> regression data
        #--- dataset for classification ------------
        np.random.seed(seed = 123)

        NN = 1000 # the number of data points
        x1 = np.random.uniform(0, 1, NN)
        x2 = np.random.uniform(0, 1, NN)
        # y = 2*(x1 - 0.5) - (x2 - 0.5) > 0 # line
        # y = x1**2 - x2 > 0 # parabolic 1
        y = 2*x1*(1-x1) - x2 > 0 # parabolic 2
        # y = ((x1 - 0.5)**2 + (x2 - 0.5)**2 - 0.3**2) > 0 # circle

        # dataset
        df = pd.DataFrame({'x1':x1, 'x2':x2, 'y':y})
        for n in range(1,0): # add uncorrelated features for test; range(1,0) is empty, so none are added as written
            df['r%s'%n] = np.random.uniform(0, 1, NN)

        # train, test
        train, test = train_test_split(df, test_size = 0.2)  # no random_state is passed here
        target = ['y']
        features = [f for f in df.columns if f not in target]
        print(len(features))
        features  # bare expression inside a block: evaluated, but not displayed by the notebook
    else:
        #--- dataset for regression ----------------
        np.random.seed(seed = 123)
        x_0 = np.linspace(0,  5, num=1000, endpoint=False); r_0 = np.random.normal(0, 0.3, len(x_0))
        x_1 = np.linspace(5, 10, num=1000, endpoint=False); r_1 = np.random.normal(0, 0.3, len(x_1))
        y0 = np.concatenate((np.sin(x_0)+5, np.sin(4*x_1)))
        # y0 = np.concatenate((np.sin(x_0), np.sin(1*x_1)))

        x = np.concatenate((x_0, x_1))
        y = y0 + np.concatenate((r_0, r_1))  # noisy target; y0 is the noise-free signal

        df = pd.DataFrame({'x':x, 'y0':y0, 'y':y})

        # train, test
        train, test = train_test_split(df, test_size = 0.2)
        features = ['x']
        target = ['y']

    #--- LigthGBM -----------------------------------------------------------------
    eval_metric = 'auc'; lgbmodel = LGBMClassifier(learning_rate = 0.1, n_jobs = 4, random_state = 123) # classification 
    # eval_metric = 'rmse'; lgbmodel = LGBMRegressor(learning_rate = 0.1, n_jobs = 4, random_state = 123) # regression

    # Every stage below is commented out, so the schedule is empty and
    # xgrid_search_boosting runs no search for LightGBM: it only cross-validates and fits.
    params_schedule = [
#         #ref) https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html
#         #ref) https://lightgbm.readthedocs.io/en/latest/Parameters.html
#         #ref) https://www.reddit.com/r/MachineLearning/comments/aspx8x/d_methods_for_hyperparameter_tuning_with_lightgbm/
#         {'learning_rate': ('log', [0.1])},

#         {'num_leaves': ('int', [4, 16, 31, 64, 128, 256]),  # uni / log / int
#     #      'min_child_weight': ['uni', [1,50,100,200,500]]}, # min_sum_hessian_in_leaf (uni / log / int)
#          'min_child_weight': ('log', [1,100])}, # min_sum_hessian_in_leaf (uni / log / int)
#         {'min_split_gain': ('uni', [0, 1, 10, 100])}, # min_gain_to_split (uni / log / int)
#         {'subsample': ('uni', [0.2, 0.4, 0.6, 0.8, 1.0]), # bagging_fraction
#          'subsample_freq': ('int', [1]), # bagging_freq
#          'colsample_bytree': ('uni', [0.2, 0.4, 0.6, 0.8, 1.0])},  # feature_fraction (uni / log / int)
#         {'reg_alpha': ('log', [0.0001, 1, 10, 100, 1000]), # lambda_l1 
#          'reg_lambda': ('log', [0.0001, 1, 10, 100, 1000])}, # lambda_l2 (uni / log / int)

#         {'learning_rate': ('log', [0.005, 0.1, 0.2])} # uni / log / int
    ]

    lgbmodel, test_pred_lgb = xgrid_search_boosting(train, test, features, target, params_schedule, lgbmodel, eval_metric,
                                CV_MODE = 'cross validation 1', N_FOLD = 5, N_BOOST_ROUND = 10000, EARLY_STOPPING = 50, RAND_SEED = 123)
#     print('Train'); utils.eval_model_scores(lgbmodel, train, features, target)
#     print('Test'); utils.eval_model_scores(lgbmodel, test, features, target)
    
    #--- XGBoost -----------------------------------------------------------------
    eval_metric = 'auc'; xgbmodel = XGBClassifier(learning_rate = 0.1, n_jobs = 4, random_state = 123) # classification 
#     eval_metric = 'rmse'; xgbmodel = XGBRegressor(learning_rate = 0.1, n_jobs = 4, random_state = 123) # regression

    # Stages are tuned in order; the best values of each stage are written into the
    # model before the next stage starts.
    params_schedule = [
        #ref) https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
        {'learning_rate': ('log', [0.1])},

        {'max_depth': ('int', [3, 5, 7, 9, 11]), # uni / log / int
         'min_child_weight': ('log', [1, 10, 100, 1000])}, # uni / log / int
        {'gamma': ('log', [0, 0.001, 1, 1000])}, # uni / log / int
        {'subsample': ('uni', [0.2, 0.4, 0.6, 0.8, 1.0]),  # uni / log / int
         'colsample_bytree': ('uni', [0.2, 0.4, 0.6, 0.8, 1.0])},  # uni / log / int
        {'reg_alpha': ('log', [0, 0.001, 1, 1000]),
         'reg_lambda': ('log', [1, 10, 100, 1000])},  # uni / log / int

        {'learning_rate': ('log', [0.005, 0.1, 0.2])} # uni / log / int
    ]

    xgbmodel, test_pred_xgb = xgrid_search_boosting(train, test, features, target, params_schedule, xgbmodel, eval_metric,
                                CV_MODE = 'cross validation 1', N_FOLD = 5, N_BOOST_ROUND = 10000, EARLY_STOPPING = 50, RAND_SEED = 123)
    # NOTE(review): eval_model_scores / feature_importance_xgboost come from the external `utils` module, not shown here
    print('Train'); utils.eval_model_scores(xgbmodel, train, features, target)
    print('Test'); utils.eval_model_scores(xgbmodel, test, features, target)
    print(utils.feature_importance_xgboost(features, xgbmodel))

In [None]:


# NOTE(review): Eval_Model_Scores is not defined or imported in the visible cells, and
# xgbmodel/train/test/features/target exist only if the __DEBUG_MODE__ cell has run -- confirm before Run All.
print('> Train:'); Eval_Model_Scores(xgbmodel, train, features, target)
print('> Test:'); Eval_Model_Scores(xgbmodel, test, features, target)


In [None]:
# NOTE(review): Plot_2D_Decision_Bourndary is not defined or imported in the visible cells -- confirm where it comes from.
Plot_2D_Decision_Bourndary(xgbmodel, train, test, features, target)


In [16]:
# Scratch check: list.index finds a tuple by equality and returns its position.
[(1,2), (3,4), (3,2), (5,4)].index((3,2))

2