### XGB Training and Optimization

In [None]:
# ------------ Optimize XGB based on AUC ------------
# Builds the training/testing DMatrix objects and the hyper-parameter grid that
# is scanned with cross-validation. The tree depth is fixed to the single value
# given on the command line; all other parameters are scanned.
if doOptimization:
    # Parse max depth.
    # The option is required: without it args.md would be None and the grid
    # construction below would fail with an unrelated TypeError.
    parser = argparse.ArgumentParser('Simple xgboost training.')
    parser.add_argument('-md', '--md', type=int, required=True, help='BDT max depth')
    args = parser.parse_args()

    print("Doing Optimization for maxDepth ", args.md)

    # Extract features (x) and labels (y)
    x_test  = df_test [ FEATURES ].copy()
    x_train = df_train[ FEATURES ].copy()
    y_test  = df_test ['class']
    y_train = df_train['class']

    # Define DMatrix for training and testing
    dtest_clf  = xgb.DMatrix(x_test , y_test , enable_categorical=True)
    dtrain_clf = xgb.DMatrix(x_train, y_train, enable_categorical=True)

    # Reduced grid: one max_depth value (args.md) combined with every value of
    # the remaining parameters
    max_depth        = np.arange(args.md, args.md+1, 1)
    min_child_weight = np.array([2,3,4,5])
    subsample        = np.array([0.8, 0.9, 1])
    colsample_bytree = np.array([0.8, 0.9, 1])
    eta              = np.array([0.1])

    # Define the grid as Pandas DF: one row per combination of the values above.
    # from_product() enumerates the Cartesian product with the first parameter
    # varying slowest and the last one fastest; to_frame(index=False) turns the
    # levels into regular columns on a default 0..N-1 index.
    grid = pd.MultiIndex.from_product(
        [max_depth, min_child_weight, subsample, colsample_bytree, eta],
        names=['max_depth', 'min_child_weight', 'subsample', 'colsample_bytree', 'eta'],
    ).to_frame(index=False)

    # Print points to be scanned
    print('Probing a grid of shape:', grid.shape)
    print('- Parameters:')
    print('  - max_depth        :', max_depth)
    print('  - min_child_weight :', min_child_weight)
    print('  - subsample        :', subsample)
    print('  - colsample_bytree :', colsample_bytree)
    print('  - eta              :', eta)

    # Define npoints just for printing progress of optimization:
    # progress is reported for every npoints-th grid point. Grids with fewer
    # than 80 points (npoints < 10) report every point.
    npoints = int(grid.shape[0]/8)
    if npoints < 10: npoints = 1

    # Function to apply the xgb CrossValidation for each grid point
    def fit(x):
        """Cross-validate one grid point and summarise the result.

        Args:
            x: One row of ``grid`` (a pandas Series, as handed over by
                ``grid.apply(fit, axis=1)``) with the entries 'max_depth',
                'min_child_weight', 'subsample', 'colsample_bytree' and 'eta'.
                Its ``name`` is the row label, used only for progress printing.

        Returns:
            A flat numpy array: the round with the highest 'test-auc-mean',
            followed by the values of the last row of the ``xgb.cv`` result.
            The caller stores these as 'rounds', 'train-auc-mean',
            'train-auc-std', 'test-auc-mean' and 'test-auc-std'.
        """
        # Set parameters for each grid point
        # https://xgboost.readthedocs.io/en/latest/parameter.html#parameters-for-tree-booster
        # Values are read by column label: integer keys on a label-indexed
        # Series are a deprecated positional fallback in pandas.
        # The row arrives as floats (mixed int/float columns), hence the int().
        params = {
            'booster'          : 'gbtree',
            'objective'        : 'binary:logistic',
            'eval_metric'      : 'auc',
            'tree_method'      : 'hist',
            'sampling_method'  : 'uniform',
            'max_depth'        : int(x['max_depth']),
            'min_child_weight' : int(x['min_child_weight']),
            'subsample'        : x['subsample'],
            'colsample_bytree' : x['colsample_bytree'],
            'eta'              : x['eta'],
        }

        # Run CrossValidation for each grid-point
        # https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.cv
        xgb_cv = xgb.cv(
            dtrain                = dtrain_clf,
            params                = params,
            nfold                 = 5,
            metrics               = 'auc',
            seed                  = 42,
            num_boost_round       = 999, # set large value
            early_stopping_rounds = 10   # let early stopping select the round (i.e. the number of trees)
        )

        # Return values from the best round of boosting.
        # NOTE(review): the metrics are taken from the LAST row of the history;
        # this matches the best round only if xgb.cv trims its history at the
        # early-stopping point - confirm for the installed xgboost version.
        best_round = xgb_cv['test-auc-mean'].argmax()
        res = np.concatenate(([best_round], xgb_cv.iloc[-1].to_numpy()), axis=None)

        # Progress report for every npoints-th grid point
        # (res[0] = best round, res[3] = test AUC mean)
        if int(x.name) % npoints == 0:
            print('- Done Grid point:', x.name, 'Params:', x['max_depth'], x['min_child_weight'], x['subsample'], x['colsample_bytree'], x['eta'])
            print('    nTrees:', res[0], ' Test AUC:', res[3])
        return res

    # Extend the grid to include xgb results for each point:
    # fit() is called once per row and its 5 return values become new columns
    print("Start Grid Search:", datetime.now())
    grid[['rounds', 'train-auc-mean', 'train-auc-std', 'test-auc-mean', 'test-auc-std']] = grid.apply(
        fit,                   # Function to be applied
        axis = 1,              # Apply to each row
        result_type = 'expand' # List-like results will be turned into columns
    )
    print("End Grid Search:", datetime.now())

    # Print model with highest test-AUC.
    # idxmax() returns an index *label*, so the row is selected with the
    # label-based .loc rather than the positional .iloc.
    print('** test-auc-mean **\n', grid.loc[grid['test-auc-mean'].idxmax()])


# ------------ Retrain model with optimized parameters ------------
# Trains a single booster with a fixed set of hyper-parameters, using early
# stopping to choose the number of boosting rounds.
if doFinalFit:
    print("Doint final fit")

    # Labels (y) come from the 'class' column, features (x) from the FEATURES columns
    y_train = df_train['class']
    y_test  = df_test ['class']
    x_train = df_train[ FEATURES ].copy()
    x_test  = df_test [ FEATURES ].copy()

    # Wrap both samples into DMatrix objects
    dtrain_clf = xgb.DMatrix(data=x_train, label=y_train, enable_categorical=True)
    dtest_clf  = xgb.DMatrix(data=x_test , label=y_test , enable_categorical=True)

    # Hyper-parameters of the final model
    # NOTE(review): colsample_bytree=0.6 is not among the values scanned by the
    # grid in the optimization section above - presumably taken from an
    # earlier scan; confirm.
    print("Using best parameters:")
    params = dict(
        max_depth=3,
        min_child_weight=5,
        subsample=1,
        colsample_bytree=0.6,
        eta=0.1,
        eval_metric='auc',
        objective='binary:logistic',
    )
    print(params)

    # Early stopping: give up after 10 rounds without improvement on the
    # evaluation set; save_best=True asks for the best model to be returned
    early_stop = xgb.callback.EarlyStopping(rounds=10, save_best=True)

    # Train classifier
    # NOTE(review): the evaluation set driving early stopping is the test
    # sample, which is also used for the ROC AUC reported further down - that
    # score may therefore be optimistic; consider a separate validation sample.
    xgb_classifier = xgb.train(
        params=params,
        dtrain=dtrain_clf,
        num_boost_round=999,
        evals=[(dtest_clf, "Test")],
        callbacks=[early_stop],
    )

    # Classifier name (path of the saved model; also the stem for the other outputs)
    outname = 'updated_results/3_finalFit/xgbV175_binary_featV8_finalFit_v1.json'

    # Save model and config.
    # save_config() already returns the configuration as a JSON string. It is
    # parsed first so that the file contains a real, indented JSON object;
    # dumping the string directly would write one escaped string literal.
    xgb_classifier.save_model(outname)
    with open(outname.replace('.json','_cfg.json'), 'w', encoding='utf-8') as fout:
        json.dump(json.loads(xgb_classifier.save_config()), fout, ensure_ascii=False, indent=4)

    # Run prediction on the test sample
    pred = xgb_classifier.predict(dtest_clf)

    # Print model results
    print('  - Model          :', outname)
    print('    ROC AUC score  :', sk.metrics.roc_auc_score (y_test, pred))
    print('    nTrees         :', xgb_classifier.num_boosted_rounds())

    # Feature importances: one {feature name: score} dict per importance type
    f_gain   = xgb_classifier.get_score(importance_type="gain"  )
    f_weight = xgb_classifier.get_score(importance_type="weight")
    f_cover  = xgb_classifier.get_score(importance_type="cover" )

    def _ranked_scores(scores):
        """Return a one-column ('score') DataFrame indexed by feature, highest score first."""
        return pd.DataFrame(
            data=list(scores.values()), index=list(scores.keys()), columns=["score"]
        ).sort_values(by="score", ascending=False)

    data_gain   = _ranked_scores(f_gain  )
    data_weight = _ranked_scores(f_weight)
    data_cover  = _ranked_scores(f_cover )

    # One horizontal bar chart per importance type, showing the 40 highest
    # scores, saved next to the model file.
    # The axes are passed to DataFrame.plot() explicitly: without ax= pandas
    # opens a figure of its own, so the figure created here would stay empty,
    # the saved image would ignore figsize/dpi, and the pandas figure would
    # never be closed.
    for label, scores in (('Gain', data_gain), ('Weight', data_weight), ('Cover', data_cover)):
        fig, ax = plt.subplots(figsize=(10, 7), dpi=100)
        scores.nlargest(40, columns="score").plot(kind='barh', ax=ax)
        ax.set_title('XGB Feature Importance by ' + label)
        ax.set_xlabel('Score')
        fig.savefig(outname.replace('.json', '_FeatImp_' + label + '.png'))
        plt.close(fig)
