# XGBoost Classifier Tuning

XGBoost was selected because.....

In [30]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import recall_score
import xgboost as xgb
import time

import sys
sys.path.insert(0, '../src')
import helpers as h

SEED = 123
holdoutseed = 4
CPU = -1
FOLDS = 5

The native API for XGBoost was used instead of the klearn version because it:
1. Automatically finds the best number of boosting rounds
2. Has build in cross validation


In [9]:
X_train, X_test, y_train, y_test, \
c_train, c_test, X_holdout, y_holdout, c_holdout, \
features = h.load_data(holdoutseed, engineered_features=False)

#*********** CHANGE TO VER 04 **************
classifiers = h.load_classifier_dict("classifiers_ver04.pickle")

In [31]:
# Instead of numpy arrays or pandas dataframes, XGB used DMatrix
ss = preprocessing.StandardScaler()
X_train = ss.fit_transform(X_train)
sm = SMOTE(random_state=SEED, n_jobs=CPU, sampling_strategy=classifiers["XGB"]['SMOTE_bestparm'])
X_resample, y_resample = sm.fit_resample(X_train, y_train)

dtrain = xgb.DMatrix(X_resample, label=y_resample)
dtest = xgb.DMatrix(X_test, label=y_test)

In [19]:
params = {
    # Parameters that we are going to tune.
#     'max_depth':6,
#     'min_child_weight': 1,
#     'eta':.3,
#     'subsample': 1,
#     'colsample_bytree': 1
    # Other parameters
#    'objective':'reg:linear',
}

# use Area Under Precision-Recall Curve for eval metric
params['eval_metric'] = "aucpr"


In [None]:
# early stopping rounds is number of rounds to stop if no improvement

num_boost_round = 999
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10,
    seed=SEED,
    nfold=5
)

cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10
)
cv_results['test-aucpr-mean'].min()

In [None]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]

In [None]:
# Define initial best params and MAE
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))


In [32]:
max_aucpr = 0
gridsearch_params = [max_depth for max_depth in range(2,10)]
for max_depth in gridsearch_params:
    start_time = time.time()
    params['max_depth'] = max_depth
    cv_results = xgb.cv(params,
                        dtrain,
                        num_boost_round=999,
                        seed=42,
                        nfold=5,
                        metrics={'aucpr'},
                        early_stopping_rounds=10)
    mean_aucpr = cv_results['test-aucpr-mean'].max()
    boost_rounds = cv_results['test-aucpr-mean'].argmax()
    print(f"\aucpr {mean_aucpr} for {boost_rounds} rounds")
    if mean_aucpr > max_aucpr:
        max_aucpr = mean_aucpr
        best_params = (max_depth)

    t = time.time() - start_time
    print(f"{t:.0f} seconds execution time for max depth {max_depth}")

print(f"Best params: {best_params}, AUCPR: {max_mae}")

ucpr 0.8921470000000001 for 0 rounds
574 seconds execution time for max depth 2
ucpr 0.9222286000000001 for 0 rounds
227 seconds execution time for max depth 3
ucpr 0.9375062 for 0 rounds
364 seconds execution time for max depth 4


KeyboardInterrupt: 

In [38]:
# Define initial best params and MAE
max_max = 0
best_params = None
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(2,5)
    for min_child_weight in range(5,8)
]

for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=999,
        seed=42,
        nfold=5,
        metrics={'aucpr'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_aucpr = cv_results['test-aucpr-mean'].max()
    boost_rounds = cv_results['test-aucpr-mean'].argmax()
    print("\tAUCPR {} for {} rounds".format(mean_aucpr, boost_rounds))

    if mean_aucpr > max_aucpr:
        max_aucpr = mean_aucpr
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_aucpr))

CV with max_depth=2, min_child_weight=5
	AUCPR 0.9971800000000002 for 998 rounds
CV with max_depth=2, min_child_weight=6
	AUCPR 0.9955008 for 578 rounds
CV with max_depth=2, min_child_weight=7
	AUCPR 0.9971038 for 998 rounds
CV with max_depth=3, min_child_weight=5
	AUCPR 0.9969802 for 266 rounds
CV with max_depth=3, min_child_weight=6
	AUCPR 0.9978226000000001 for 398 rounds
CV with max_depth=3, min_child_weight=7
	AUCPR 0.9984132000000001 for 614 rounds
CV with max_depth=4, min_child_weight=5
	AUCPR 0.9993058000000001 for 336 rounds
CV with max_depth=4, min_child_weight=6
	AUCPR 0.9991199999999999 for 277 rounds
CV with max_depth=4, min_child_weight=7
	AUCPR 0.999216 for 346 rounds


TypeError: 'NoneType' object is not subscriptable