In [1]:
import pandas as pd;
import numpy as np;
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from hyperopt import STATUS_OK
from hyperopt import hp

# Upper bound on objective evaluations; passed to fmin() as max_evals
MAX_EVALS = 500
# Number of cross-validation folds; default for objective()'s n_folds argument
N_FOLDS = 10



Hyperopt

Hyperopt is one of several automated hyperparameter tuning libraries that use Bayesian optimization. These libraries differ in the algorithm used both to construct the surrogate (probability model) of the objective function and to choose the next hyperparameters to evaluate in the objective function. Hyperopt uses the Tree-structured Parzen Estimator (TPE). Other Python libraries include Spearmint, which uses a Gaussian process for the surrogate, and SMAC, which uses random forest regression.

Hyperopt has a simple syntax for structuring an optimization problem, and it extends beyond hyperparameter tuning to any problem that involves minimizing a function. Moreover, the structure of a Bayesian optimization problem is similar across the libraries, with the major differences coming in the syntax (and in the algorithms behind the scenes, which we do not have to deal with).


In [2]:
# Load the training set.
# The data directory can be overridden with the SANTANDER_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is not set, the original location is used unchanged.
import os

DATA_DIR = os.environ.get(
    "SANTANDER_DATA_DIR",
    "/home/altieris/datascience/data/santander-customer-transaction-prediction",
)
train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))

In [3]:
# Columns that are not model inputs: the label and the row identifier
cols = ["target", "ID_code"]

# Feature columns excluded from the model inputs
drop_from_shap = [
    'var_185', 'var_181', 'var_182', 'var_183', 'var_14', 'var_41',
    'var_98', 'var_103', 'var_189', 'var_10', 'var_7', 'var_64',
    'var_65', 'var_84', 'var_73', 'var_79', 'var_96', 'var_19',
    'var_17', 'var_160', 'var_143', 'var_46', 'var_153', 'var_39',
    'var_38', 'var_37', 'var_158', 'var_136', 'var_161', 'var_42',
    'var_129', 'var_30', 'var_29', 'var_126', 'var_124', 'var_27',
    'var_117', 'var_59', 'var_100', 'var_168', 'var_3', 'var_8',
    'var_74', 'var_72', 'var_178', 'var_54', 'var_171', 'var_50',
    'var_152', 'var_55', 'var_140', 'var_138', 'var_193', 'var_156',
    'var_113', 'var_63', 'var_15', 'var_60', 'var_69', 'var_159',
    'var_47', 'var_61', 'var_176', 'var_68', 'var_16', 'var_57',
    'var_101', 'var_25', 'var_120', 'var_194', 'var_11', 'var_45',
    'var_196', 'var_83', 'var_20', 'var_77', 'var_134', 'var_82',
    'var_102', 'var_97', 'var_187', 'var_142', 'var_88', 'var_62',
    'var_4',
]

# Feature matrix: drop the label/id first, then the excluded features
without_label_and_id = train.drop(columns=cols)
train_x = without_label_and_id.drop(columns=drop_from_shap)

# Label column
train_y = train["target"]




In [4]:
%%time
idx = features = train_x.columns.values[0:123]
for df in [train_x]:
    df['sum'] = df[idx].sum(axis=1)  
    df['min'] = df[idx].min(axis=1)
    df['max'] = df[idx].max(axis=1)
    df['mean'] = df[idx].mean(axis=1)
    df['std'] = df[idx].std(axis=1)
    df['skew'] = df[idx].skew(axis=1)
    df['kurt'] = df[idx].kurtosis(axis=1)
    df['med'] = df[idx].median(axis=1)

CPU times: user 6.84 s, sys: 1.85 s, total: 8.69 s
Wall time: 2.85 s


In [5]:
train_x.head()

Unnamed: 0,var_0,var_1,var_2,var_5,var_6,var_9,var_12,var_13,var_18,var_21,...,var_198,var_199,sum,min,max,mean,std,skew,kurt,med
0,8.9255,-6.7863,11.9081,-9.2834,5.1187,5.747,14.0137,0.5745,4.284,16.2191,...,12.7803,-1.0914,797.9364,-21.4494,31.4045,6.938577,9.12529,-0.150509,0.627495,6.5674
1,11.5006,-4.1473,13.8588,7.0433,5.6208,8.0851,14.0239,8.4135,7.8,2.7407,...,18.356,1.9518,894.972,-13.5901,40.5632,7.782365,8.981713,0.588345,1.959466,7.3412
2,8.6093,-2.7457,12.0805,-9.0837,6.9427,5.9525,14.1929,7.3124,4.7011,18.1377,...,14.7222,0.3965,646.3819,-22.4038,25.682,5.620712,8.191134,-0.321422,0.993937,5.6777
3,11.0604,-2.1518,8.9522,-1.8361,5.8428,8.245,13.8463,11.9704,15.9426,12.5579,...,17.9697,-8.9996,743.4463,-11.0882,24.6065,6.46475,7.150558,-0.122003,-0.072295,6.8202
4,9.8369,-1.4834,12.8746,2.4486,5.9405,7.6784,13.8481,7.8895,6.5263,18.9608,...,17.9974,-8.8104,833.1106,-39.7997,40.1236,7.24444,9.199366,-0.717864,6.281791,6.986


In [6]:
# LightGBM dataset shared by every cross-validation run in objective().
# The search varies min_data_in_leaf while reusing this single Dataset, so
# feature pre-filtering (which is tied to min_data_in_leaf at construction
# time) is switched off to keep the dataset valid for every sampled value.
train_set = lgb.Dataset(train_x, label=train_y, params={'feature_pre_filter': False})

In [18]:
import csv
from hyperopt import STATUS_OK
from timeit import default_timer as timer

def objective(params, n_folds = N_FOLDS):
    """Objective function for Gradient Boosting Machine Hyperparameter Optimization.

    Runs LightGBM cross validation for one set of hyperparameters and reports
    the result in the dictionary format used by the optimizer.

    Parameters
    ----------
    params : dict
        One sampled configuration. ``params['boosting_type']`` is expected to
        be a nested dict holding the actual ``'boosting_type'`` value and,
        optionally, a ``'subsample'`` value. The caller's dict is not modified.
    n_folds : int
        Number of cross-validation folds.

    Returns
    -------
    dict
        ``loss`` (1 - best mean AUC), the flattened ``params``, the
        ``iteration`` counter, the number of boosting rounds (``estimators``)
        at the best score, ``train_time`` in seconds and ``status``.

    Side effects
    ------------
    Increments the global ``ITERATION`` counter and appends one row to the CSV
    file named by the global ``out_file``.
    """

    # Keep track of evals
    global ITERATION

    ITERATION += 1

    # Work on a shallow copy: the nested 'boosting_type' entry is replaced
    # below, and rewriting the caller's dict would make a second call with the
    # same dict fail.
    params = dict(params)

    # Flatten the nested boosting choice; subsample defaults to 1.0 when the
    # chosen boosting type does not define one
    boosting_choice = params['boosting_type']
    params['boosting_type'] = boosting_choice['boosting_type']
    params['subsample'] = boosting_choice.get('subsample', 1.0)

    # Make sure parameters that need to be integers are integers
    for parameter_name in ['min_data_in_leaf', 'max_depth']:
        params[parameter_name] = int(params[parameter_name])

    start = timer()

    # Perform n_folds cross validation
    cv_results = lgb.cv(params, train_set, num_boost_round = 10000, nfold = n_folds, early_stopping_rounds = 100, metrics = 'auc', seed = 50)

    run_time = timer() - start

    # Extract the best score
    best_score = np.max(cv_results['auc-mean'])

    # Loss must be minimized
    loss = 1 - best_score

    # Boosting rounds that returned the highest cv score
    n_estimators = int(np.argmax(cv_results['auc-mean']) + 1)

    # Append the result to the csv file; the context manager closes the handle
    # after every evaluation so no file descriptors are leaked and each row is
    # flushed to disk. newline='' lets the csv module control line endings.
    with open(out_file, 'a', newline='') as of_connection:
        writer = csv.writer(of_connection)
        writer.writerow([loss, params, ITERATION, n_estimators, run_time])

    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'iteration': ITERATION,
            'estimators': n_estimators,
            'train_time': run_time, 'status': STATUS_OK}

In [16]:
# Define the search space
# Each key is a parameter name that objective() forwards to lgb.cv; each value
# is a hyperopt sampling expression. The first string argument of every hp.*
# call is that expression's label.
space = {
    # Nested choice: every option is a dict carrying the boosting type and,
    # except for 'goss', its own 'subsample' range. objective() flattens the
    # selected dict into top-level 'boosting_type' and 'subsample' entries.
    # NOTE(review): the first subsample label is spelled 'gdbt_subsample'
    # (not 'gbdt') — presumably a typo; anything that looks results up by
    # label has to use this spelling. TODO confirm before renaming.
    'boosting_type': hp.choice('boosting_type', 
                               [{'boosting_type': 'gbdt', 
                                    'subsample': hp.uniform('gdbt_subsample', 0.5, 1)}, 
                                 {'boosting_type': 'dart', 
                                     'subsample': hp.uniform('dart_subsample', 0.5, 1)},
                                 {'boosting_type': 'goss'}]),
    
    #'num_leaves': hp.quniform('num_leaves', 10, 20, 1),
    # Log-uniform between 0.01 and 0.2 (bounds are given as logs)
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    #'subsample_for_bin': hp.quniform('subsample_for_bin', 1000, 20000, 300),
    #'min_child_samples': hp.quniform('min_child_samples', 20, 500, 5),
    #'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    #'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
    # NOTE(review): the label 'colsample_by_tree' differs from the key 'colsample_bytree'
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0),
    # The next two are cast to int inside objective() before being used
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'min_data_in_leaf': hp.quniform('min_data_in_leaf', 20, 100, 5), 
    # Not cast by objective(); passed on as sampled
    'min_sum_hessian_in_leaf': hp.quniform('min_sum_hessian_in_leaf', 5, 20, 1), 
}

In [9]:
from hyperopt import tpe

# optimization algorithm
# Alias for hyperopt's TPE suggestion function (tpe.suggest)
tpe_algorithm = tpe.suggest

In [10]:
from hyperopt import Trials

# Keep track of results
# Handed to fmin() through its `trials` argument; its `.results` list is read
# once the search has run.
bayes_trials = Trials()

In [11]:
# File to save first results
out_file = 'gbm_trials.csv'

# Start a fresh results file that contains only the header row; objective()
# appends one row per evaluation afterwards. The context manager closes the
# file even if writing fails, and newline='' lets the csv module control the
# line endings.
with open(out_file, 'w', newline='') as of_connection:
    writer = csv.writer(of_connection)

    # Write the headers to the file
    writer.writerow(['loss', 'params', 'iteration', 'estimators', 'train_time'])

In [12]:
from hyperopt import fmin

In [19]:
%%capture

# Global variable
global  ITERATION

ITERATION = 0

# Run optimization
best = fmin(fn = objective, space = space, algo = tpe.suggest, 
            max_evals = MAX_EVALS, trials = bayes_trials, rstate = np.random.RandomState(50))

KeyboardInterrupt: 

In [None]:
# Rank the recorded trials by ascending loss, i.e. best (highest AUC) first,
# then display the top two
bayes_trials_results = list(bayes_trials.results)
bayes_trials_results.sort(key = lambda trial: trial['loss'])
bayes_trials_results[:2]