In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import warnings
# Blanket filter: every warning raised after this point is suppressed for the
# whole session, including deprecation notices from the libraries used below.
warnings.simplefilter('ignore')

In [2]:
from preprocess import pack
from features import Features,feature_selection
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import optimization
# Instantiate the project's Features helper with the settings in `pack`, run
# it, and pass the result through feature_selection. The returned container
# exposes the training inputs and labels as attributes, which the cells below
# use as X and y.
feat = Features(**pack)
data = feat.execute()
container = feature_selection(data)
trainx, trainy = container.trainx, container.trainy

In [3]:
from time import time

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score,mean_squared_error
from sklearn.metrics import make_scorer

from xgboost import XGBClassifier, DMatrix
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, DeltaYStopper
from skopt.space import Real, Categorical, Integer

from functools import partial
import pprint
import pandas as pd
import numpy as np

In [4]:
def report_perf(optimizer, X, y, title="model", callbacks=None):
    """Fit a search object, print a short summary, and return its best parameters.

    Parameters
    ----------
    optimizer : object
        Search object exposing ``fit`` and, after fitting, ``cv_results_``,
        ``best_score_``, ``best_index_`` and ``best_params_``.
    X, y :
        Passed unchanged to ``optimizer.fit``.
    title : str
        Label prefixed to the printed summary line.
    callbacks : optional
        When not None, forwarded to ``optimizer.fit`` as ``callback=``.

    Returns
    -------
    The value of ``optimizer.best_params_`` after fitting.
    """
    started_at = time()
    # Only pass `callback` when the caller supplied one.
    fit_kwargs = {} if callbacks is None else {"callback": callbacks}
    optimizer.fit(X, y, **fit_kwargs)

    results = pd.DataFrame(optimizer.cv_results_)
    top_score = optimizer.best_score_
    # Spread of the test score across folds for the winning candidate.
    top_score_std = results.iloc[optimizer.best_index_].std_test_score
    winner = optimizer.best_params_

    summary_format = title + " took %.2f seconds,  candidates checked: %d, best CV score: %.3f \u00B1 %.3f"
    n_candidates = len(optimizer.cv_results_['params'])
    print(summary_format % (time() - started_at, n_candidates, top_score, top_score_std))
    print('Best parameters:')
    pprint.pprint(winner)
    print()
    return winner

In [25]:
# Base XGBoost estimator with the settings that stay fixed during the search;
# the tuned hyper-parameters come from `search_spaces` below.
# tree_method='hist' trains on CPU; use 'gpu_hist' to train on a GPU.
model = XGBClassifier(random_state=0, booster='gbtree', tree_method='hist', eval_metric='mlogloss')

# Accuracy scorer (higher is better). The metric function is passed directly:
# wrapping it in functools.partial with no bound arguments adds nothing and
# yields an object without __name__, which scikit-learn's scorer repr reads.
scoring = make_scorer(accuracy_score, greater_is_better=True)

# Materialize the stratified 7-fold splits once so the same (train, test)
# index pairs are reused for every candidate via the `cv` argument.
skf = StratifiedKFold(n_splits=7, shuffle=True, random_state=0)
cv_strategy = list(skf.split(trainx, trainy))


# Hyper-parameter ranges explored by the search.
search_spaces = {'learning_rate': Real(0.01, 1.0, 'uniform'),
                 'max_depth': Integer(2, 12),
                 # subsample ratio of rows
                 'subsample': Real(0.1, 1.0, 'uniform'),
                 # subsample ratio of columns by tree
                 'colsample_bytree': Real(0.1, 1.0, 'uniform'),
                 # Regularization terms, currently excluded from the search:
                 #  'reg_lambda': Real(1e-9, 100., 'uniform'),  # L2 regularization
                 #  'reg_alpha': Real(1e-9, 100., 'uniform'),  # L1 regularization
                 'n_estimators': Integer(50, 5000)
                 }


# Reference values only; nothing in this cell reads them.
# NOTE(review): some names (num_iterations, feature_fraction) look like
# LightGBM-style parameters rather than XGBoost ones; confirm before reuse.
# num_class=7,
# learning_rate=0.1,
# num_iterations=1000,
# max_depth=10,
# feature_fraction=0.7,
# scale_pos_weight=1.5,

In [26]:
# Bayesian hyper-parameter search over `search_spaces`, scored with `scoring`
# on the pre-computed folds in `cv_strategy`.
opt = BayesSearchCV(
    estimator=model,
    search_spaces=search_spaces,
    scoring=scoring,
    cv=cv_strategy,
    # Maximum number of trials.
    n_iter=120,
    # Number of hyper-parameter sets evaluated at the same time.
    n_points=1,
    n_jobs=1,
    # If not iid, the search optimizes on the CV score.
    # NOTE(review): `iid` is deprecated or removed in newer scikit-learn /
    # scikit-optimize releases; confirm against the installed versions.
    iid=False,
    return_train_score=False,
    # The best configuration is not refit here; the model is rebuilt from
    # best_params in a later cell.
    refit=False,
    # Optimizer parameters: Gaussian Process (GP) as the base estimator.
    optimizer_kwargs={'base_estimator': 'GP'},
    random_state=0,
)

In [28]:
# Early-stopping callbacks handed to the search:
#  - DeltaYStopper: configured with delta=0.0001 (per the scikit-optimize docs
#    it stops once the best results are within `delta` of each other).
#  - DeadlineStopper: time budget of 60*10 = 600 (seconds, per the docs).
overdone_control = DeltaYStopper(delta=0.0001)
time_limit_control = DeadlineStopper(total_time=60 * 10)

# Run the search and keep the winning hyper-parameters for the final model.
best_params = report_perf(
    opt,
    trainx,
    trainy,
    title='XGBoost_classif',
    callbacks=[overdone_control, time_limit_control],
)



XGBoost_classif took 723.18 seconds,  candidates checked: 11, best CV score: 0.549 ± 0.010
Best parameters:
OrderedDict([('colsample_bytree', 1.0),
             ('learning_rate', 0.029711244781271254),
             ('max_depth', 8),
             ('n_estimators', 2329),
             ('subsample', 0.6656766262219243)])



In [29]:
# Rebuild the classifier with the same fixed settings as the search estimator
# (plus verbosity=0) and the hyper-parameters returned by the search.
model = XGBClassifier(
    random_state=0,
    booster='gbtree',
    tree_method='hist',
    eval_metric='mlogloss',
    verbosity=0,
    **best_params,
)

In [30]:
# Evaluate the tuned model with a single stratified k-fold pass.
# accuracy_score is the metric, so the per-fold values and the summary are
# reported as accuracy (higher is better).
folds = 10
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=0)
fold_accuracies = []

for k, (train_idx, val_idx) in enumerate(skf.split(trainx, trainy)):
    # fit() is called again on the same estimator object for every fold.
    model.fit(trainx.iloc[train_idx].values, trainy.iloc[train_idx].values)
    val_preds = model.predict(trainx.iloc[val_idx].values)
    val_accuracy = accuracy_score(y_true=trainy.iloc[val_idx].values, y_pred=val_preds)
    print(f"Fold {k} accuracy: {val_accuracy:0.5f}")
    fold_accuracies.append(val_accuracy)
print(f"{folds}-fold CV accuracy: {np.mean(fold_accuracies):0.5f} (std={np.std(fold_accuracies):0.5f})")

# Previous name for this list, kept as an alias in case other cells read it.
rmse = fold_accuracies

Fold 0 RMSE: 0.54041
Fold 1 RMSE: 0.56092
Fold 2 RMSE: 0.52955
Fold 3 RMSE: 0.55730
Fold 4 RMSE: 0.55609
Fold 5 RMSE: 0.56454
Fold 6 RMSE: 0.53076
Fold 7 RMSE: 0.54041
Fold 8 RMSE: 0.53076
Fold 9 RMSE: 0.57246
repeated CV RMSE: 0.54832 (std=0.01497)
