In [1]:
#hyperparameter searching

In [2]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [3]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [4]:
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import glob
from collections import Counter

In [5]:
path = './synthetic_data/'

input_df = pd.DataFrame()
files = glob.glob(path+"*.csv")
for name in files:
    df = pd.read_csv(name) 
    input_df = pd.concat([input_df, df], ignore_index=True)

input_df = input_df.replace([np.inf, -np.inf], 0)
input_df = input_df.sample(frac=1) # we suffle the dataframe

In [6]:
y_col = 'marker'
X_cols = input_df.loc[:, input_df.columns != y_col].columns

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(input_df[X_cols], input_df[y_col],test_size=0.2, 
                                                    random_state=42)

print(' X_train shape',X_train.shape, '\n', 
      'y_train shape', y_train.shape, '\n',
      'X_test shape',X_test.shape, '\n', 
      'y_test shape', y_test.shape)

 X_train shape (1485518, 128) 
 y_train shape (1485518,) 
 X_test shape (371380, 128) 
 y_test shape (371380,)


In [7]:

class_counter = Counter(y_train)
print(' Number of items class 0:', class_counter[0], '\n',
      'Number of items class 1:', class_counter[1])
estimate_imbalancing = class_counter[0]/class_counter[1]
print('Imbalancing Factor: ',estimate_imbalancing)

 Number of items class 0: 366775 
 Number of items class 1: 1118743
Imbalancing Factor:  0.3278456267435863


In [8]:
# A parameter grid for XGBoost
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'max_depth': [3, 4, 5, 6, 7, 8],
        'booster' : ['gbtree', 'linear', 'dart'],
        }
#        'n_estimators': [500,1000,5000]
#        'subsample': [0.6, 0.8, 1.0],
#        'colsample_bytree': [0.6, 0.8, 1.0],
#

In [9]:

XGB = xgb.XGBRegressor(base_score=0.5, 
             booster='gbtree', 
             colsample_bylevel=1,
             colsample_bynode=1, 
             enable_categorical=False,
             gamma=1,
             gpu_id=-1, 
             importance_type=None,
             interaction_constraints='', 
             learning_rate=0.30000012,
             max_delta_step=0,
             monotone_constraints='()', 
             n_estimators=100,
             n_jobs=8,
             num_parallel_tree=1, 
             predictor='auto', 
             random_state=123, 
             reg_alpha=0,
             tree_method='exact',
             validate_parameters=1, verbosity=None,
             scale_pos_weight=estimate_imbalancing,
             reg_lambda=1,
)
# max_depth=6

In [10]:
folds = 2
param_comb = 4

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 123)

random_search = RandomizedSearchCV(XGB, param_distributions=params, n_iter=param_comb, scoring='roc_auc', 
                                   n_jobs=4, cv=skf.split(X_train,y_train), verbose=3, random_state=123 )

# Here we go
start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X_train, y_train)
timer(start_time) # timing ends here for "start_time" variable

Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV 2/2] END booster=dart, gamma=2, max_depth=4, min_child_weight=10;, score=0.823 total time=47.3min
[CV 1/2] END booster=dart, gamma=2, max_depth=4, min_child_weight=10;, score=0.822 total time=47.3min
[CV 1/2] END booster=gbtree, gamma=1, max_depth=7, min_child_weight=1;, score=0.836 total time=38.5min
[CV 1/2] END booster=linear, gamma=1, max_depth=7, min_child_weight=1;, score=nan total time=   6.5s
[CV 1/2] END booster=gbtree, gamma=1, max_depth=6, min_child_weight=10;, score=0.834 total time=21.3min
[CV 2/2] END booster=gbtree, gamma=1, max_depth=7, min_child_weight=1;, score=0.837 total time=38.6min
[CV 2/2] END booster=linear, gamma=1, max_depth=7, min_child_weight=1;, score=nan total time=   5.6s
[CV 2/2] END booster=gbtree, gamma=1, max_depth=6, min_child_weight=10;, score=0.834 total time=21.3min

 Time taken: 1 hours 17 minutes and 21.4 seconds.


In [11]:
print('\n All results:')
print(random_search.cv_results_)
print('\n Best estimator:')
print(random_search.best_estimator_)
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)


 All results:
{'mean_fit_time': array([2307.17403281, 2625.57491791,    6.06671453, 1276.87334037]), 'std_fit_time': array([4.90580881, 0.3549875 , 0.43678641, 1.88990068]), 'mean_score_time': array([  5.90558147, 214.74724531,   0.        ,   1.86570251]), 'std_score_time': array([1.50296164, 0.16980052, 0.        , 0.60828269]), 'param_min_child_weight': masked_array(data=[1, 10, 1, 10],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'param_max_depth': masked_array(data=[7, 4, 7, 6],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'param_gamma': masked_array(data=[1, 2, 1, 1],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'param_booster': masked_array(data=['gbtree', 'dart', 'linear', 'gbtree'],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'min_child_weight': 1, 'max_d

In [12]:
y_proba = random_search.predict(X_test)
y_pred_round = [round(value) for value in y_proba]
accuracy = accuracy_score(y_test, y_pred_round)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 74.92%


In [13]:

regressor = random_search.best_estimator_
regressor.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=1, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.30000012,
             max_delta_step=0, max_depth=7, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=123,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=0.3278456267435863,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [14]:
y_pred = regressor.predict(X_test)
mean_squared_error(y_test, y_pred)

0.1671404

In [15]:
y_pred_round = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, y_pred_round)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 74.92%


In [16]:
#EOF