# Bayesian hyperparameter optimisation

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bayes_opt import BayesianOptimization
import xgboost as xgb
import gc
import re
#from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [4]:
path = "/vols/build/cms/akd116/MLStudies/local/"
X = pd.read_hdf(f'{path}/data_tauspinner_20Apr2020_2018/dataset_fold0_sm_tt_2018.hdf5')

In [5]:
X.dropna(inplace=True)

X = X[X["multi_class"] != "misc"]
X["multi_class"].replace("qqh","ggh",inplace=True)
X["multi_class"].replace("vh","ggh",inplace=True)
X["multi_class"].replace("ggh","higgs",inplace=True)

# for purpose of hyperparameter tuning, use CV function later
# use full fold for this, not train_test_split
X_train = X
y_train = X["multi_class"]
w_train = X["wt_xs"]

print(X_train[(X_train.multi_class == 'higgs')].shape)
del X
gc.collect()

(703672, 20)


262

In [6]:
sum_w = X_train['wt_xs'].sum()
sum_w_cat = X_train.groupby('multi_class')['wt_xs'].sum()
class_weights = sum_w / sum_w_cat

class_weight_dict = dict(class_weights)

print(class_weight_dict)

# multiply w_train by class_weight now
# add mjj dependent weight for ggH

for i in w_train.index:
    for key, value in class_weight_dict.items():
        if y_train[i] == key:
            w_train.at[i] *= value

sum_w_cat_after = X_train.groupby('multi_class')['wt_xs'].sum()
print(sum_w_cat_after)

{'higgs': 280.7979212687279, 'jetFakes': 1.2381128294841357, 'ztt_embed': 5.297791839187975}
multi_class
higgs        140583.167839
jetFakes     140583.167839
ztt_embed    140583.167839
Name: wt_xs, dtype: float64


In [7]:
## use one-hot encoding
# encode class values as integers
encoder_train = LabelEncoder()
encoder_train.fit(y_train)

y_train = encoder_train.transform(y_train)

display(X_train.head(5))

dropVars = ["wt","wt_xs", "process", "multi_class","event","gen_match_1", "gen_match_2",]
dropVars.append("wt_cp_sm")
dropVars.append("wt_cp_ps")
dropVars.append("wt_ph_nnlops")

X_train = X_train.drop(dropVars, axis=1).reset_index(drop=True)

# to use names "f0" etcs
#print(X_train.columns)
#orig_columns = X_train.columns
#X_train.columns = ["f{}".format(x) for x in np.arange(X_train.shape[1])]
#print(X_train.columns)

Unnamed: 0,event,gen_match_1,gen_match_2,jdeta,jpt_1,m_vis,met,mjj,multi_class,n_jets,process,pt_1,pt_tt,pt_vis,svfit_mass,wt,wt_cp_ps,wt_cp_sm,wt_ph_nnlops,wt_xs
1,543514,5,5,1.80808,123.065634,105.019546,201.573943,184.500941,higgs,2,WplusHToTauTauUncorrelatedDecay_Filtered,67.395817,180.302066,44.251234,249.24234,0.886234,0.737991,1.235461,1.0,0.088991
2,676679852,32766,306349108,-9999.0,-9999.0,168.07773,12.856351,-9999.0,jetFakes,0,TauD,94.824577,8.375764,20.81305,190.162834,0.279605,0.0,0.0,1.0,0.346183
3,81428506,5,5,3.15234,386.12522,93.386927,92.209754,551.044926,higgs,3,GluGluHToTauTauUncorrelatedDecay_Filtered,166.946978,399.544256,307.790879,116.494202,1.221447,0.46054,1.322198,0.434325,0.437226
4,58891302,5,5,-9999.0,-9999.0,93.309634,5.676605,-9999.0,higgs,0,GluGluHToTauTauUncorrelatedDecay_Filtered,49.122306,12.169055,7.326344,110.91531,0.926597,1.779434,0.260535,1.276106,0.192027
8,162302500,32767,2798793780,-9999.0,46.519429,103.453547,2.963086,-9999.0,jetFakes,1,TauB,52.928593,5.677488,4.177655,114.194538,0.074121,0.0,0.0,1.0,0.091771


In [8]:
# Capture stderr and stdout using the function below
import contextlib

@contextlib.contextmanager
def capture():
    import sys
    from io import StringIO
    olderr, oldout = sys.stderr, sys.stdout
    try:
        out = [StringIO(), StringIO()]
        sys.stderr, sys.stdout = out
        yield out
    finally:
        sys.stderr, sys.stdout = olderr, oldout
        out[0] = out[0].getvalue().splitlines()
        out[1] = out[1].getvalue().splitlines()

In [9]:
# Callable to be passed to BayesianOptimisation

def xgb_cv(
    learning_rate, gamma, min_child_weight, subsample,
    colsample_bytree, reg_lambda, reg_alpha,
    #max_depth, max_delta_step,
):
    paramt = {
        'gamma': gamma,
        'booster': 'gbtree',
        'max_depth': 4, #max_depth.astype(int),
        'learning_rate': learning_rate,
        'objective': 'multi:softprob',
        'nthread': -1,
        'silent': 1,
        'reg_lambda': reg_lambda,
        'reg_alpha': reg_alpha,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'min_child_weight': min_child_weight,
        'max_delta_step': 2, #max_delta_step.astype(int),
        'num_class': 3,
        'seed': 123456,
    }

    folds = 2
    cv_score = 0

    print(" Search parameters ({}-fold validation):\n {}".format(folds, paramt),
          file=log_file)
    log_file.flush()

    # Do not optimize the number of boosting rounds, as early stopping will take care of that

    with capture() as result:
        res = xgb.cv(
            paramt,
            dtrain,
            num_boost_round=20000,
            stratified=True,
            nfold=folds,
            verbose_eval=1,
            early_stopping_rounds=20,
            metrics="mlogloss",
            seed=123456,
        )
        display(res)
        
# All relevant things in XGboost output are in stdout, so we screen result[1]
# for a line with "cv-mean". This line signifies the end of output and contains CV values.
# Next we split the line to extract CV values. We also print the whole CV run into file
# In previous XGboost the output was in stderr, in which case we would need result[0]

    
    print('', file=log_file)
    for line in result[1]:
        print(line, file=log_file)
        if str(line).find('test-mlogloss-mean'):
            cv_score = float(re.split('[|]| |\t|:', line)[4].split("+")[0])
    log_file.flush()

    # The CV metrics function in XGboost can be lots of things. Some of them need to be maximized, like AUC.
    # If the metrics needs to be minimized, e.g, logloss, the return line below should be a negative number
    # as Bayesian Optimizer only knows how to maximize the function
    print(-1 * cv_score)
    return (-1.0 * cv_score)

In [10]:
dtrain = xgb.DMatrix(X_train, label=y_train, weight=w_train)

In [11]:
dtrain.feature_names

['jdeta',
 'jpt_1',
 'm_vis',
 'met',
 'mjj',
 'n_jets',
 'pt_1',
 'pt_tt',
 'pt_vis',
 'svfit_mass']

In [None]:
log_file = open("bayes_opt_2018_fold0_fixMaxDepthDeltaStep_second.txt", 'a')
#opt_kwargs = {
#    'max_depth': (3, 6),'learning_rate': (0.01, 1.),
#    'gamma': (0.0001, 2.), 'min_child_weight': (1, 500),
#    'max_delta_step': (0, 5), 'subsample': (0.2, 1.),
#    'colsample_bytree': (0.2, 1.),
#    'reg_lambda': (0., 5.), 'reg_alpha': (0., 5.), 
#}
#opt_kwargs = {
#    'max_depth': (3.5, 5.5),'learning_rate': (0.05, 0.5),
#    'gamma': (0.1, 2.), 'min_child_weight': (300, 600),
#    'max_delta_step': (0, 3), 'subsample': (0.6, 1.),
#    'colsample_bytree': (0.6, 1.),
#    'reg_lambda': (0., 5.), 'reg_alpha': (0., 2.), 
#}
#opt_kwargs = {
#    'learning_rate': (0.05, 0.1), 'gamma': (0.1, 2.), 'min_child_weight': (250, 400),
#    'subsample': (0.6, 1.), 'colsample_bytree': (0.6, 1.),
#    'reg_lambda': (0., 5.), 'reg_alpha': (0., 2.), 
#}
opt_kwargs = {
    'learning_rate': (0.05, 0.09), 'gamma': (0.1, 1.2), 'min_child_weight': (100, 400),
    'subsample': (0.6, 0.9), 'colsample_bytree': (0.6, 0.9),
    'reg_lambda': (0., 2.), 'reg_alpha': (0., 0.4), 
}
xgb_bo = BayesianOptimization(xgb_cv, opt_kwargs)
print('\n', file=log_file)
log_file.flush()

print('Running Bayesian Optimization ...\n')
xgb_bo.maximize(init_points=5, n_iter=5)

print('\nFinal Results', file=log_file)
print('XGBOOST: {}'.format(xgb_bo.max), file=log_file)
log_file.flush()
log_file.close()

Running Bayesian Optimization ...

|   iter    |  target   | colsam... |   gamma   | learni... | min_ch... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------


Unnamed: 0,train-mlogloss-mean,train-mlogloss-std,test-mlogloss-mean,test-mlogloss-std
0,1.071477,0.000929,1.071604,0.001138
1,1.045098,0.000363,1.045334,0.000758
2,1.021434,0.000974,1.021748,0.001574
3,1.000641,0.001389,1.001031,0.002097
4,0.980093,0.001495,0.980568,0.002341
...,...,...,...,...
1130,0.671835,0.002015,0.690962,0.002150
1131,0.671817,0.002018,0.690961,0.002148
1132,0.671799,0.002018,0.690959,0.002147
1133,0.671789,0.002017,0.690959,0.002148


-0.69096
| [0m 1       [0m | [0m-0.691   [0m | [0m 0.8413  [0m | [0m 0.6117  [0m | [0m 0.06428 [0m | [0m 258.3   [0m | [0m 0.1231  [0m | [0m 1.555   [0m | [0m 0.7564  [0m |


Unnamed: 0,train-mlogloss-mean,train-mlogloss-std,test-mlogloss-mean,test-mlogloss-std
0,1.076763,0.000768,1.076866,0.000926
1,1.055173,0.000304,1.055352,0.000628
2,1.035410,0.000855,1.035654,0.001357
3,1.017737,0.001208,1.018040,0.001800
4,1.000015,0.001372,1.000401,0.002052
...,...,...,...,...
1432,0.672215,0.002038,0.691047,0.002052
1433,0.672203,0.002039,0.691046,0.002051
1434,0.672193,0.002039,0.691042,0.002051
1435,0.672186,0.002040,0.691043,0.002053


-0.69105
| [0m 2       [0m | [0m-0.6911  [0m | [0m 0.8538  [0m | [0m 1.137   [0m | [0m 0.05147 [0m | [0m 259.7   [0m | [0m 0.2492  [0m | [0m 1.941   [0m | [0m 0.678   [0m |


Unnamed: 0,train-mlogloss-mean,train-mlogloss-std,test-mlogloss-mean,test-mlogloss-std
0,1.074885,0.000561,1.075044,0.000699
1,1.059858,0.007863,1.060057,0.008152
2,1.038315,0.007618,1.038581,0.008076
3,1.024514,0.008730,1.024858,0.009310
4,1.004846,0.008568,1.005303,0.009240
...,...,...,...,...
1204,0.671177,0.001957,0.691393,0.002198
1205,0.671164,0.001953,0.691394,0.002200
1206,0.671151,0.001953,0.691391,0.002198
1207,0.671142,0.001951,0.691387,0.002199


-0.69139
| [0m 3       [0m | [0m-0.6914  [0m | [0m 0.6178  [0m | [0m 0.4435  [0m | [0m 0.05694 [0m | [0m 176.2   [0m | [0m 0.09412 [0m | [0m 1.94    [0m | [0m 0.8921  [0m |


Unnamed: 0,train-mlogloss-mean,train-mlogloss-std,test-mlogloss-mean,test-mlogloss-std
0,1.064186,0.000873,1.064364,0.001063
1,1.043171,0.011077,1.043401,0.011490
2,1.013759,0.010337,1.014038,0.010980
3,0.995603,0.011728,0.995997,0.012536
4,0.970180,0.011173,0.970689,0.012098
...,...,...,...,...
1028,0.673135,0.002042,0.691674,0.002060
1029,0.673114,0.002042,0.691677,0.002057
1030,0.673102,0.002041,0.691674,0.002055
1031,0.673086,0.002043,0.691671,0.002053


-0.69168
| [0m 4       [0m | [0m-0.6917  [0m | [0m 0.635   [0m | [0m 0.8502  [0m | [0m 0.08371 [0m | [0m 369.3   [0m | [0m 0.3292  [0m | [0m 1.518   [0m | [0m 0.6995  [0m |


Unnamed: 0,train-mlogloss-mean,train-mlogloss-std,test-mlogloss-mean,test-mlogloss-std
0,1.068351,0.001018,1.068504,0.001251
1,1.039264,0.000378,1.039532,0.000792
2,1.013562,0.000972,1.013944,0.001633
3,0.991181,0.001314,0.991661,0.002086
4,0.969213,0.001446,0.969745,0.002332
...,...,...,...,...
1039,0.673202,0.001904,0.691223,0.002240
1040,0.673185,0.001903,0.691226,0.002234
1041,0.673172,0.001906,0.691220,0.002234
1042,0.673153,0.001905,0.691219,0.002237


-0.69122
| [0m 5       [0m | [0m-0.6912  [0m | [0m 0.8419  [0m | [0m 0.1178  [0m | [0m 0.07187 [0m | [0m 360.4   [0m | [0m 0.2507  [0m | [0m 0.561   [0m | [0m 0.8789  [0m |


Unnamed: 0,train-mlogloss-mean,train-mlogloss-std,test-mlogloss-mean,test-mlogloss-std
0,1.064947,0.000846,1.065122,0.001048
1,1.044272,0.010854,1.044500,0.011240
2,1.015296,0.010118,1.015572,0.010753
3,0.997329,0.011468,0.997744,0.012261
4,0.972238,0.010955,0.972776,0.011858
...,...,...,...,...
768,0.670721,0.001993,0.691580,0.002074
769,0.670704,0.002003,0.691576,0.002071
770,0.670683,0.002004,0.691575,0.002077
771,0.670660,0.002005,0.691572,0.002083


-0.69159
| [0m 6       [0m | [0m-0.6916  [0m | [0m 0.6492  [0m | [0m 0.1091  [0m | [0m 0.08169 [0m | [0m 100.0   [0m | [0m 0.07477 [0m | [0m 1.157   [0m | [0m 0.6793  [0m |


Unnamed: 0,train-mlogloss-mean,train-mlogloss-std,test-mlogloss-mean,test-mlogloss-std
0,1.072419,0.000912,1.072526,0.001111
1,1.046911,0.000375,1.047118,0.000758
2,1.023962,0.001017,1.024221,0.001576
3,1.003686,0.001367,1.004050,0.002074
4,0.983605,0.001558,0.984022,0.002343
...,...,...,...,...
1128,0.675218,0.002076,0.691503,0.002071
1129,0.675210,0.002069,0.691499,0.002071
1130,0.675191,0.002068,0.691499,0.002072
1131,0.675179,0.002071,0.691494,0.002070


-0.6915
| [0m 7       [0m | [0m-0.6915  [0m | [0m 0.8516  [0m | [0m 0.7945  [0m | [0m 0.06204 [0m | [0m 400.0   [0m | [0m 0.3585  [0m | [0m 1.965   [0m | [0m 0.7306  [0m |


Unnamed: 0,train-mlogloss-mean,train-mlogloss-std,test-mlogloss-mean,test-mlogloss-std
0,1.073277,0.000865,1.073416,0.001045
1,1.048471,0.000321,1.048731,0.000697
2,1.026159,0.000872,1.026509,0.001408
3,1.006403,0.001210,1.006842,0.001854
4,0.986719,0.001414,0.987230,0.002153
...,...,...,...,...
1066,0.669245,0.002051,0.691265,0.002094
1067,0.669233,0.002049,0.691263,0.002092
1068,0.669216,0.002040,0.691259,0.002097
1069,0.669202,0.002034,0.691256,0.002101


-0.69126
| [0m 8       [0m | [0m-0.6913  [0m | [0m 0.8114  [0m | [0m 0.8711  [0m | [0m 0.05982 [0m | [0m 100.0   [0m | [0m 0.06018 [0m | [0m 0.7071  [0m | [0m 0.8962  [0m |


Unnamed: 0,train-mlogloss-mean,train-mlogloss-std,test-mlogloss-mean,test-mlogloss-std
0,1.071852,0.000624,1.072014,0.000794
1,1.055016,0.008776,1.055252,0.009074
2,1.031123,0.008401,1.031408,0.008877
3,1.015946,0.009592,1.016339,0.010216
4,0.994486,0.009336,0.994990,0.010076
...,...,...,...,...
1073,0.668805,0.002037,0.691535,0.002201
1074,0.668797,0.002034,0.691534,0.002203
1075,0.668786,0.002038,0.691534,0.002203
1076,0.668767,0.002029,0.691529,0.002205


-0.69154
| [0m 9       [0m | [0m-0.6915  [0m | [0m 0.6409  [0m | [0m 0.9077  [0m | [0m 0.06443 [0m | [0m 100.1   [0m | [0m 0.1849  [0m | [0m 0.4647  [0m | [0m 0.8993  [0m |


## Optimised hyperparameters

In [1]:
params_optimum = {
    'gamma': 0.6116681199107202, 'booster': 'gbtree', 
    'max_depth': 4, 'learning_rate': 0.06427997105779841, 
    'objective': 'multi:softprob', 'nthread': -1, 'silent': 1, 
    'reg_lambda': 1.5550976281683702, 'reg_alpha': 0.12309703586128405, 
    'subsample': 0.7563612725885316, 'colsample_bytree': 0.8412848980537144, 
    'min_child_weight': 258.320681582356, 'max_delta_step': 2, 
    'num_class': 3, 'seed': 123456
}